1102 files changed, 64418 insertions, 25597 deletions
diff --git a/.bazelrc b/.bazelrc
index a2fe95822..e2848ef07 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -28,20 +28,12 @@ build:remote --bes_results_url="https://source.cloud.google.com/results/invocati
 build:remote --bes_timeout=600s
 build:remote --project_id=gvisor-rbe
 build:remote --remote_instance_name=projects/gvisor-rbe/instances/default_instance
-build:remote3 --remote_executor=grpcs://remotebuildexecution.googleapis.com
-build:remote3 --project_id=gvisor-rbe
-build:remote3 --bes_backend=buildeventservice.googleapis.com
-build:remote3 --bes_results_url="https://source.cloud.google.com/results/invocations"
-build:remote3 --bes_timeout=600s
-build:remote3 --remote_instance_name=projects/gvisor-rbe/instances/default_instance
 
 # Enable authentication. This will pick up application default credentials by
 # default. You can use --google_credentials=some_file.json to use a service
 # account credential instead.
 build:remote --google_default_credentials=true
 build:remote --auth_scope="https://www.googleapis.com/auth/cloud-source-tools"
-build:remote3 --google_default_credentials=true
-build:remote3 --auth_scope="https://www.googleapis.com/auth/cloud-source-tools"
 
 # Add a custom platform and toolchain that builds in a privileged docker
 # container, which is required by our syscall tests.
@@ -50,31 +42,5 @@ build:remote --extra_toolchains=//tools/bazeldefs:cc-toolchain-clang-x86_64-defa
 build:remote --extra_execution_platforms=//tools/bazeldefs:rbe_ubuntu1604
 build:remote --platforms=//tools/bazeldefs:rbe_ubuntu1604
 build:remote --crosstool_top=@rbe_default//cc:toolchain
-build:remote --jobs=100
+build:remote --jobs=300
 build:remote --remote_timeout=3600
-build:remote3 --host_platform=//tools/bazeldefs:rbe_ubuntu1604_bazel3
-build:remote3 --extra_toolchains=//tools/bazeldefs:cc-toolchain-clang-x86_64-default_bazel3
-build:remote3 --extra_execution_platforms=//tools/bazeldefs:rbe_ubuntu1604_bazel3
-build:remote3 --platforms=//tools/bazeldefs:rbe_ubuntu1604_bazel3
-build:remote3 --crosstool_top=@rbe_default//cc:toolchain
-build:remote3 --jobs=100
-build:remote3 --remote_timeout=3600
-
-# Set flags for uploading to BES in order to view results in the Bazel Build
-# Results UI.
-build:results --bes_backend="buildeventservice.googleapis.com"
-build:results --bes_timeout=60s
-build:results --tls_enabled
-
-# Output BES results url
-build:results --bes_results_url="https://source.cloud.google.com/results/invocations/"
-
-# Set flags for uploading to BES without Remote Build Execution.
-build:results-local --bes_backend="buildeventservice.googleapis.com"
-build:results-local --bes_timeout=60s
-build:results-local --tls_enabled=true
-build:results-local --auth_enabled=true
-build:results-local --spawn_strategy=local
-build:results-local --remote_cache=remotebuildexecution.googleapis.com
-build:results-local --remote_timeout=3600
-build:results-local --bes_results_url="https://source.cloud.google.com/results/invocations/"
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 49a1ba697..50d187633 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -23,7 +23,7 @@ reproduced with software that is publicly available.
 
 Please include the following details of your environment:
 
-*   `runsc -v`
+*   `runsc -version`
 *   `docker version` or `docker info` (if available)
 *   `kubectl version` and `kubectl get nodes` (if using Kubernetes)
 *   `uname -a`
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index cf782a580..e28e46352 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -3,9 +3,11 @@ on:
   push:
     branches:
       - master
+      - feature/**
   pull_request:
     branches:
       - master
+      - feature/**
 
 jobs:
   default:
@@ -19,3 +21,8 @@ jobs:
         restore-keys: |
           ${{ runner.os }}-bazel-
     - run: make
+    - run: make build OPTIONS="--build_tag_filters nogo" TARGETS="//..."
+    - run: make run TARGETS="//tools/github" ARGS="-path=bazel-bin/ nogo"
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        GITHUB_REPOSITORY: ${{ github.repository }}
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 4da3853b2..3a6a592d1 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -6,6 +6,7 @@ on:
   pull_request:
     branches:
       - master
+      - feature/**
 
 jobs:
   generate:
@@ -49,7 +50,12 @@ jobs:
         key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }}
         restore-keys: |
           ${{ runner.os }}-bazel-
+      # Create gopath to merge the changes. The first execution will create
+      # symlinks to the cache, e.g. bazel-bin. Once the cache is setup, delete
+      # old gopath files that may exist from previous runs (and could contain
+      # files that are now deleted). Then run gopath again for good.
     - run: |
+        make build TARGETS="//:gopath"
         rm -rf bazel-bin/gopath
         make build TARGETS="//:gopath"
     - run: tools/go_branch.sh
diff --git a/.github/workflows/issue_reviver.yml b/.github/workflows/issue_reviver.yml
index 2b399a3f2..c53185620 100644
--- a/.github/workflows/issue_reviver.yml
+++ b/.github/workflows/issue_reviver.yml
@@ -9,7 +9,7 @@ jobs:
     steps:
     - uses: actions/checkout@v2
       if: github.repository == 'google/gvisor'
-    - run: make run TARGETS="//tools/issue_reviver"
+    - run: make run TARGETS="//tools/github" ARGS="revive"
       if: github.repository == 'google/gvisor'
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 13babef4d..a56f6ebcd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
 # Generated bazel symlinks.
-/bazel-*
+/bazel-*
+\ No newline at end of file
diff --git a/BUILD b/BUILD
index 2639f8169..153464220 100644
--- a/BUILD
+++ b/BUILD
@@ -1,10 +1,17 @@
 load("//tools:defs.bzl", "build_test", "gazelle", "go_path")
+load("//tools/nogo:defs.bzl", "nogo_config")
 load("//website:defs.bzl", "doc")
 
 package(licenses = ["notice"])
 
 exports_files(["LICENSE"])
 
+nogo_config(
+    name = "nogo_config",
+    srcs = ["nogo.yaml"],
+    visibility = ["//:sandbox"],
+)
+
 doc(
     name = "contributing",
     src = "CONTRIBUTING.md",
@@ -75,12 +82,19 @@ go_path(
     name = "gopath",
     mode = "link",
     deps = [
-        # Main binary.
-        "//runsc",
-        "//shim/v1:gvisor-containerd-shim",
-        "//shim/v2:containerd-shim-runsc-v1",
+        # Main binaries.
+        #
+        # For reasons related to reproducibility of the generated
+        # files, in order to ensure that :gopath produces only a
+        # a single "pure" version of all files, we can only depend
+        # on go_library targets here, and not go_binary. Thus the
+        # binaries have been factored into a cli package, which is
+        # a good practice in any case.
+        "//runsc/cli",
+        "//shim/v1/cli",
+        "//shim/v2/cli",
 
-        # Packages that are not dependencies of //runsc.
+        # Packages that are not dependencies of the above.
         "//pkg/sentry/kernel/memevent",
         "//pkg/tcpip/adapters/gonet",
         "//pkg/tcpip/link/channel",
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 89180eb3f..c53df7d25 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -70,10 +70,8 @@ Rules:
     *   `@org_golang_x_sys//unix:go_default_library` (Go import
         `golang.org/x/sys/unix`).
     *   Generated Go protobuf packages.
-    *   `@com_github_golang_protobuf//proto:go_default_library` (Go import
-        `github.com/golang/protobuf/proto`).
-    *   `@com_github_golang_protobuf//ptypes:go_default_library` (Go import
-        `github.com/golang/protobuf/ptypes`).
+    *   `@org_golang_google_protobuf//proto:go_default_library` (Go import
+        `google.golang.org/protobuf`).
 
 *   `runsc` may only depend on the following packages:
 
diff --git a/Makefile b/Makefile
index fdbc6fb41..88f23de8d 100644
--- a/Makefile
+++ b/Makefile
@@ -94,9 +94,9 @@ endef
 rebuild-...: ## Rebuild the given image. Also may use 'rebuild-all-images'.
 $(eval $(call images,rebuild))
 push-...: ## Push the given image. Also may use 'push-all-images'.
-$(eval $(call images,pull))
-pull-...: ## Pull the given image. Also may use 'pull-all-images'.
 $(eval $(call images,push))
+pull-...: ## Pull the given image. Also may use 'pull-all-images'.
+$(eval $(call images,pull))
 load-...: ## Load (pull or rebuild) the given image. Also may use 'load-all-images'.
 $(eval $(call images,load))
 list-images: ## List all available images.
@@ -114,25 +114,28 @@ runsc: ## Builds the runsc binary.
 .PHONY: runsc
 
 debian: ## Builds the debian packages.
-	@$(call submake,build OPTIONS="-c opt" TARGETS="//runsc:runsc-debian")
+	@$(call submake,build OPTIONS="-c opt" TARGETS="//debian:debian")
 .PHONY: debian
 
 smoke-tests: ## Runs a simple smoke test after build runsc.
 	@$(call submake,run DOCKER_PRIVILEGED="" ARGS="--alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do true")
 .PHONY: smoke-tests
 
+fuse-tests:
+	@$(call submake,test OPTIONS="--test_tag_filters fuse" TARGETS="test/fuse/...")
+.PHONY: fuse-tests
+
 unit-tests: ## Local package unit tests in pkg/..., runsc/, tools/.., etc.
 	@$(call submake,test TARGETS="pkg/... runsc/... tools/...")
+.PHONY: unit-tests
 
 tests: ## Runs all unit tests and syscall tests.
-tests: unit-tests
-	@$(call submake,test TARGETS="test/syscalls/...")
+tests: unit-tests syscall-tests
 .PHONY: tests
 
-
 integration-tests: ## Run all standard integration tests.
 integration-tests: docker-tests overlay-tests hostnet-tests swgso-tests
-integration-tests: do-tests kvm-tests root-tests containerd-tests
+integration-tests: do-tests kvm-tests containerd-test-1.3.4
 .PHONY: integration-tests
 
 network-tests: ## Run all networking integration tests.
@@ -143,20 +146,23 @@ network-tests: iptables-tests packetdrill-tests packetimpact-tests
 INTEGRATION_TARGETS := //test/image:image_test //test/e2e:integration_test
 
 syscall-%-tests:
-	@$(call submake,test OPTIONS="--test_tag_filters runsc_$* test/syscalls/...")
+	@$(call submake,test OPTIONS="--test_tag_filters runsc_$*" TARGETS="test/syscalls/...")
 
 syscall-native-tests:
-	@$(call submake,test OPTIONS="--test_tag_filters native test/syscalls/...")
+	@$(call submake,test OPTIONS="--test_tag_filters native" TARGETS="test/syscalls/...")
 .PHONY: syscall-native-tests
 
 syscall-tests: ## Run all system call tests.
-syscall-tests: syscall-ptrace-tests syscall-kvm-tests syscall-native-tests
-.PHONY: syscall-tests
+	@$(call submake,test TARGETS="test/syscalls/...")
 
 %-runtime-tests: load-runtimes_%
 	@$(call submake,install-test-runtime)
 	@$(call submake,test-runtime OPTIONS="--test_timeout=10800" TARGETS="//test/runtimes:$*")
 
+%-runtime-tests_vfs2: load-runtimes_%
+	@$(call submake,install-test-runtime RUNTIME="vfs2" ARGS="--vfs2")
+	@$(call submake,test-runtime RUNTIME="vfs2" OPTIONS="--test_timeout=10800" TARGETS="//test/runtimes:$*")
+
 do-tests: runsc
 	@$(call submake,run TARGETS="//runsc" ARGS="--rootless do true")
 	@$(call submake,run TARGETS="//runsc" ARGS="--rootless -network=none do true")
@@ -182,6 +188,7 @@ swgso-tests: load-basic-images
 	@$(call submake,install-test-runtime RUNTIME="swgso" ARGS="--software-gso=true --gso=false")
 	@$(call submake,test-runtime RUNTIME="swgso" TARGETS="$(INTEGRATION_TARGETS)")
 .PHONY: swgso-tests
+
 hostnet-tests: load-basic-images
 	@$(call submake,install-test-runtime RUNTIME="hostnet" ARGS="--network=host")
 	@$(call submake,test-runtime RUNTIME="hostnet" OPTIONS="--test_arg=-checkpoint=false" TARGETS="$(INTEGRATION_TARGETS)")
@@ -196,6 +203,8 @@ kvm-tests: load-basic-images
 .PHONY: kvm-tests
 
 iptables-tests: load-iptables
+	@sudo modprobe iptable_filter
+	@sudo modprobe ip6table_filter
 	@$(call submake,test-runtime RUNTIME="runc" TARGETS="//test/iptables:iptables_test")
 	@$(call submake,install-test-runtime RUNTIME="iptables" ARGS="--net-raw")
 	@$(call submake,test-runtime RUNTIME="iptables" TARGETS="//test/iptables:iptables_test")
@@ -207,21 +216,18 @@ packetdrill-tests: load-packetdrill
 .PHONY: packetdrill-tests
 
 packetimpact-tests: load-packetimpact
-	@sudo modprobe iptable_filter ip6table_filter
+	@sudo modprobe iptable_filter
+	@sudo modprobe ip6table_filter
 	@$(call submake,install-test-runtime RUNTIME="packetimpact")
 	@$(call submake,test-runtime OPTIONS="--jobs=HOST_CPUS*3 --local_test_jobs=HOST_CPUS*3" RUNTIME="packetimpact" TARGETS="$(shell $(MAKE) query TARGETS='attr(tags, packetimpact, tests(//...))')")
 .PHONY: packetimpact-tests
 
-root-tests: load-basic-images
-	@$(call submake,install-test-runtime)
-	@$(call submake,sudo TARGETS="//test/root:root_test" ARGS="-test.v")
-.PHONY: root-tests
-
 # Specific containerd version tests.
-containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-basic_resolv load-basic_httpd install-test-runtime
+containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-basic_resolv load-basic_httpd load-basic_ubuntu
+	@$(call submake,install-test-runtime RUNTIME="root")
 	@CONTAINERD_VERSION=$* $(MAKE) sudo TARGETS="tools/installers:containerd"
 	@$(MAKE) sudo TARGETS="tools/installers:shim"
-	@$(MAKE) sudo TARGETS="test/root:root_test" ARGS="-test.v"
+	@$(MAKE) sudo TARGETS="test/root:root_test" ARGS="--runtime=root -test.v"
 
 # Note that we can't run containerd-test-1.1.8 tests here.
 #
@@ -250,15 +256,15 @@ WEBSITE_PROJECT := gvisordev
 WEBSITE_REGION  := us-central1
 
 website-build: load-jekyll ## Build the site image locally.
-	@$(call submake,run TARGETS="//website:website")
+	@$(call submake,run TARGETS="//website:website" ARGS="$(WEBSITE_IMAGE)")
 .PHONY: website-build
 
 website-server: website-build ## Run a local server for development.
-	@docker run -i -p 8080:8080 gvisor.dev/images/website
+	@docker run -i -p 8080:8080 $(WEBSITE_IMAGE)
 .PHONY: website-server
 
 website-push: website-build ## Push a new image and update the service.
-	@docker tag gvisor.dev/images/website $(WEBSITE_IMAGE) && docker push $(WEBSITE_IMAGE)
+	@docker push $(WEBSITE_IMAGE)
 .PHONY: website-push
 
 website-deploy: website-push ## Deploy a new version of the website.
@@ -294,15 +300,17 @@ $(RELEASE_KEY):
 	echo Name-Email: test@example.com >> $$C && \
 	echo Expire-Date: 0 >> $$C && \
 	echo %commit >> $$C && \
-	gpg --batch $(GPG_TEST_OPTIONS) --passphrase '' --no-default-keyring --keyring $$T --no-tty --gen-key $$C && \
-	gpg --batch $(GPG_TEST_OPTIONS) --export-secret-keys --no-default-keyring --keyring $$T --secret-keyring $$T > $@; \
+	gpg --batch $(GPG_TEST_OPTIONS) --passphrase '' --no-default-keyring --secret-keyring $$T --no-tty --gen-key $$C && \
+	gpg --batch $(GPG_TEST_OPTIONS) --export-secret-keys --no-default-keyring --secret-keyring $$T > $@; \
 	rc=$$?; rm -f $$T $$C; exit $$rc
 
 release: $(RELEASE_KEY) ## Builds a release.
 	@mkdir -p $(RELEASE_ROOT)
 	@T=$$(mktemp -d /tmp/release.XXXXXX); \
-	  $(call submake,copy TARGETS="runsc" DESTINATION=$$T) && \
-	  $(call submake,copy TARGETS="runsc:runsc-debian" DESTINATION=$$T) && \
+	  $(call submake,copy TARGETS="//runsc:runsc" DESTINATION=$$T) && \
+	  $(call submake,copy TARGETS="//shim/v1:gvisor-containerd-shim" DESTINATION=$$T) && \
+	  $(call submake,copy TARGETS="//shim/v2:containerd-shim-runsc-v1" DESTINATION=$$T) && \
+	  $(call submake,copy TARGETS="//debian:debian" DESTINATION=$$T) && \
 	  NIGHTLY=$(RELEASE_NIGHTLY) tools/make_release.sh $(RELEASE_KEY) $(RELEASE_ROOT) $$T/*; \
 	rc=$$?; rm -rf $$T; exit $$rc
 .PHONY: release
@@ -369,3 +377,12 @@ configure: ## Configures a single runtime. Requires sudo. Typically called from
 test-runtime: ## A convenient wrapper around test that provides the runtime argument. Target must still be provided.
 	@$(call submake,test OPTIONS="$(OPTIONS) --test_arg=--runtime=$(RUNTIME)")
 .PHONY: test-runtime
+
+nogo: ## Surfaces all nogo findings.
+	@$(call submake,build OPTIONS="--build_tag_filters nogo" TARGETS="//...")
+	@$(call submake,run TARGETS="//tools/github" ARGS="$(foreach dir,$(BUILD_ROOTS),-path=$(CURDIR)/$(dir)) -dry-run nogo")
+.PHONY: nogo
+
+gazelle: ## Runs gazelle to update WORKSPACE.
+	@$(call submake,run TARGETS="//:gazelle" ARGS="update-repos -from_file=go.mod -prune")
+.PHONY: gazelle
diff --git a/README.md b/README.md
index ed9e0e92b..0a79e2cff 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 
 ![](https://github.com/google/gvisor/workflows/Build/badge.svg)
 [![gVisor chat](https://badges.gitter.im/gvisor/community.png)](https://gitter.im/gvisor/community)
+[![code search](https://img.shields.io/badge/code-search-blue)](https://cs.opensource.google/gvisor/gvisor)
 
 ## What is gVisor?
 
diff --git a/WORKSPACE b/WORKSPACE
index 6dc060bd5..30d21e472 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -20,47 +20,28 @@ bazel_skylib_workspace()
 # Note that this repository actually patches some other Go repositories as it
 # loads it, in order to limit visibility. We hack this process by patching the
 # patch used by the Go rules, turning the trick against itself.
+
 http_archive(
     name = "io_bazel_rules_go",
+    sha256 = "b725e6497741d7fc2d55fcc29a276627d10e43fa5d0bb692692890ae30d98d00",
     patch_args = ["-p1"],
     patches = [
-        "//tools/nogo:io_bazel_rules_go-visibility.patch",
+        # Newer versions of the rules_go rules will automatically strip test
+        # binaries of symbols, which we don't want.
+        "//tools:rules_go.patch",
     ],
-    sha256 = "db2b2d35293f405430f553bc7a865a8749a8ef60c30287e90d2b278c32771afe",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.22.3/rules_go-v0.22.3.tar.gz",
-        "https://github.com/bazelbuild/rules_go/releases/download/v0.22.3/rules_go-v0.22.3.tar.gz",
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.24.3/rules_go-v0.24.3.tar.gz",
+        "https://github.com/bazelbuild/rules_go/releases/download/v0.24.3/rules_go-v0.24.3.tar.gz",
     ],
 )
 
 http_archive(
     name = "bazel_gazelle",
-    sha256 = "d8c45ee70ec39a57e7a05e5027c32b1576cc7f16d9dd37135b0eddde45cf1b10",
-    urls = [
-        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/bazel-gazelle/releases/download/v0.20.0/bazel-gazelle-v0.20.0.tar.gz",
-        "https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.20.0/bazel-gazelle-v0.20.0.tar.gz",
-    ],
-)
-
-http_archive(
-    name = "io_bazel_rules_go_bazel3", # To replace the above.
-    patch_args = ["-p1"],
-    patches = [
-        "//tools/nogo:io_bazel_rules_go-visibility.patch",
-    ],
-    sha256 = "87f0fb9747854cb76a0a82430adccb6269f7d394237104a4523b51061c469171",
+    sha256 = "b85f48fa105c4403326e9525ad2b2cc437babaa6e15a3fc0b1dbab0ab064bc7c",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.23.1/rules_go-v0.23.1.tar.gz",
-        "https://github.com/bazelbuild/rules_go/releases/download/v0.23.1/rules_go-v0.23.1.tar.gz",
-    ],
-)
-
-http_archive(
-    name = "bazel_gazelle_bazel3", # To replace the above.
-    sha256 = "bfd86b3cbe855d6c16c6fce60d76bd51f5c8dbc9cfcaef7a2bb5c1aafd0710e8",
-    urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/bazel-gazelle/releases/download/v0.21.0/bazel-gazelle-v0.21.0.tar.gz",
-        "https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.21.0/bazel-gazelle-v0.21.0.tar.gz",
+        "https://mirror.bazel.build/github.com/bazelbuild/bazel-gazelle/releases/download/v0.22.2/bazel-gazelle-v0.22.2.tar.gz",
+        "https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.22.2/bazel-gazelle-v0.22.2.tar.gz",
     ],
 )
 
@@ -81,8 +62,8 @@ gazelle_dependencies()
 go_repository(
     name = "org_golang_x_sys",
     importpath = "golang.org/x/sys",
-    sum = "h1:uYVVQ9WP/Ds2ROhcaGPeIdVq0RIXVLwsHlnvJ+cT1So=",
-    version = "v0.0.0-20200302150141-5c8b2ff67527",
+    sum = "h1:xhmwyvizuTgC2qz7ZlMluP20uW+C3Rm0FD/WLDX8884=",
+    version = "v0.0.0-20200323222414-85ca7c5b95cd",
 )
 
 # Load C++ rules.
@@ -117,16 +98,6 @@ rules_proto_toolchains()
 # See releases at https://releases.bazel.build/bazel-toolchains.html
 http_archive(
     name = "bazel_toolchains",
-    sha256 = "239a1a673861eabf988e9804f45da3b94da28d1aff05c373b013193c315d9d9e",
-    strip_prefix = "bazel-toolchains-3.0.1",
-    urls = [
-        "https://github.com/bazelbuild/bazel-toolchains/releases/download/3.0.1/bazel-toolchains-3.0.1.tar.gz",
-        "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/releases/download/3.0.1/bazel-toolchains-3.0.1.tar.gz",
-    ],
-)
-
-http_archive(
-    name = "bazel_toolchains_bazel3", # To replace the above.
     sha256 = "144290c4166bd67e76a54f96cd504ed86416ca3ca82030282760f0823c10be48",
     strip_prefix = "bazel-toolchains-3.1.1",
     urls = [
@@ -208,9 +179,8 @@ http_archive(
 go_repository(
     name = "com_github_sirupsen_logrus",
     importpath = "github.com/sirupsen/logrus",
-    replace = "github.com/Sirupsen/logrus",
-    sum = "h1:cWjBmzJnL1sO88XdqJYmq7aiWClqXIQQMJ3Utgy1f+I=",
-    version = "v1.4.2",
+    sum = "h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I=",
+    version = "v1.6.0",
 )
 
 go_repository(
@@ -330,8 +300,8 @@ go_repository(
 go_repository(
     name = "org_golang_x_net",
     importpath = "golang.org/x/net",
-    sum = "h1:vGXIOMxbNfDTk/aXCmfdLgkrSV+Z2tcbze+pEc3v5W4=",
-    version = "v0.0.0-20200625001655-4c5254603344",
+    sum = "h1:VvcQYSHwXgi7W+TpUR6A9g6Up98WAHf3f/ulnJ62IyA=",
+    version = "v0.0.0-20200822124328-c89045814202",
 )
 
 go_repository(
@@ -358,15 +328,15 @@ go_repository(
 go_repository(
     name = "org_golang_x_tools",
     importpath = "golang.org/x/tools",
-    sum = "h1:YAl/dx/kLsMMIWGqfhFHW9ckqGhmq7Ki0dfoKAgvFTE=",
-    version = "v0.0.0-20200707200213-416e8f4faf8a",
+    sum = "h1:vWQvJ/Z0Lu+9/8oQ/pAYXNzbc7CMnBl+tULGVHOy3oE=",
+    version = "v0.0.0-20201002184944-ecd9fd270d5d",
 )
 
 go_repository(
     name = "org_golang_x_xerrors",
     importpath = "golang.org/x/xerrors",
-    sum = "h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=",
-    version = "v0.0.0-20191204190536-9bdfabe68543",
+    sum = "h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=",
+    version = "v0.0.0-20200804184101-5ec99f83aff1",
 )
 
 go_repository(
@@ -442,8 +412,8 @@ go_repository(
 go_repository(
     name = "com_github_konsorten_go_windows_terminal_sequences",
     importpath = "github.com/konsorten/go-windows-terminal-sequences",
-    sum = "h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s=",
-    version = "v1.0.2",
+    sum = "h1:vWQvJ/Z0Lu+9/8oQ/pAYXNzbc7CMnBl+tULGVHOy3oE=",
+    version = "v1.0.3",
 )
 
 go_repository(
@@ -470,8 +440,8 @@ go_repository(
 go_repository(
     name = "com_github_microsoft_go_winio",
     importpath = "github.com/Microsoft/go-winio",
-    sum = "h1:ygIc8M6trr62pF5DucadTWGdEB4mEyvzi0e2nbcmcyA=",
-    version = "v0.4.15-0.20190919025122-fc70bd9a86b5",
+    sum = "h1:9pygWVFqbY9lPxM0peffumuVDyMuIMzNLyO9uFjJuQo=",
+    version = "v0.4.15-0.20200908182639-5b44b70ab3ab",
 )
 
 go_repository(
@@ -484,8 +454,8 @@ go_repository(
 go_repository(
     name = "org_uber_go_atomic",
     importpath = "go.uber.org/atomic",
-    sum = "h1:Ezj3JGmsOnG1MoRWQkPBsKLe9DwWD9QeXzTRzzldNVk=",
-    version = "v1.6.0",
+    sum = "h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=",
+    version = "v1.7.0",
 )
 
 go_repository(
@@ -562,8 +532,8 @@ go_repository(
 go_repository(
     name = "com_github_containerd_continuity",
     importpath = "github.com/containerd/continuity",
-    sum = "h1:PEmIrUvwG9Yyv+0WKZqjXfSFDeZjs/q15g0m08BYS9k=",
-    version = "v0.0.0-20200710164510-efbc4488d8fe",
+    sum = "h1:jEIoR0aA5GogXZ8pP3DUzE+zrhaF6/1rYZy+7KkYEWM=",
+    version = "v0.0.0-20200928162600-f2cc35102c2a",
 )
 
 go_repository(
@@ -604,8 +574,8 @@ go_repository(
 go_repository(
     name = "com_github_dustin_go_humanize",
     importpath = "github.com/dustin/go-humanize",
-    sum = "h1:qk/FSDDxo05wdJH28W+p5yivv7LuLYLRXPPD8KQCtZs=",
-    version = "v0.0.0-20171111073723-bb3d318650d4",
+    sum = "h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=",
+    version = "v1.0.0",
 )
 
 go_repository(
@@ -623,13 +593,6 @@ go_repository(
 )
 
 go_repository(
-    name = "com_github_fsnotify_fsnotify",
-    importpath = "github.com/fsnotify/fsnotify",
-    sum = "h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I=",
-    version = "v1.4.7",
-)
-
-go_repository(
     name = "com_github_godbus_dbus",
     importpath = "github.com/godbus/dbus",
     sum = "h1:BWhy2j3IXJhjCbC68FptL43tDKIq8FladmaTs3Xs7Z8=",
@@ -660,8 +623,8 @@ go_repository(
 go_repository(
     name = "com_github_google_go_cmp",
     importpath = "github.com/google/go-cmp",
-    sum = "h1:/QaMHBdZ26BB3SSst0Iwl10Epc+xhTquomWX0oZEB6w=",
-    version = "v0.5.0",
+    sum = "h1:JFrFEBb2xKufg6XkJsJr+WbKb4FQlURi5RUcBveYu9k=",
+    version = "v0.5.1",
 )
 
 go_repository(
@@ -686,13 +649,6 @@ go_repository(
 )
 
 go_repository(
-    name = "com_github_hpcloud_tail",
-    importpath = "github.com/hpcloud/tail",
-    sum = "h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=",
-    version = "v1.0.0",
-)
-
-go_repository(
     name = "com_github_inconshreveable_mousetrap",
     importpath = "github.com/inconshreveable/mousetrap",
     sum = "h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM=",
@@ -721,20 +677,6 @@ go_repository(
 )
 
 go_repository(
-    name = "com_github_onsi_ginkgo",
-    importpath = "github.com/onsi/ginkgo",
-    sum = "h1:q/mM8GF/n0shIN8SaAZ0V+jnLPzen6WIVZdiwrRlMlo=",
-    version = "v1.10.1",
-)
-
-go_repository(
-    name = "com_github_onsi_gomega",
-    importpath = "github.com/onsi/gomega",
-    sum = "h1:XPnZz8VVBHjVsy1vzJmRwIcSwiUO+JFfrv/xGiigmME=",
-    version = "v1.7.0",
-)
-
-go_repository(
     name = "com_github_opencontainers_runc",
     importpath = "github.com/opencontainers/runc",
     sum = "h1:GlxAyO6x8rfZYN9Tt0Kti5a/cP41iuiO2yYT0IJGY8Y=",
@@ -786,43 +728,15 @@ go_repository(
 go_repository(
     name = "com_github_urfave_cli",
     importpath = "github.com/urfave/cli",
-    sum = "h1:MCfT24H3f//U5+UCrZp1/riVO3B50BovxtDiNn0XKkk=",
-    version = "v0.0.0-20171014202726-7bc6a0acffa5",
+    sum = "h1:gsqYFH8bb9ekPA12kRo0hfjngWQjkJPlN9R0N78BoUo=",
+    version = "v1.22.2",
 )
 
 go_repository(
     name = "com_github_yuin_goldmark",
     importpath = "github.com/yuin/goldmark",
-    sum = "h1:5tjfNdR2ki3yYQ842+eX2sQHeiwpKJ0RnHO4IYOc4V8=",
-    version = "v1.1.32",
-)
-
-go_repository(
-    name = "in_gopkg_airbrake_gobrake_v2",
-    importpath = "gopkg.in/airbrake/gobrake.v2",
-    sum = "h1:7z2uVWwn7oVeeugY1DtlPAy5H+KYgB1KeKTnqjNatLo=",
-    version = "v2.0.9",
-)
-
-go_repository(
-    name = "in_gopkg_fsnotify_v1",
-    importpath = "gopkg.in/fsnotify.v1",
-    sum = "h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4=",
-    version = "v1.4.7",
-)
-
-go_repository(
-    name = "in_gopkg_gemnasium_logrus_airbrake_hook_v2",
-    importpath = "gopkg.in/gemnasium/logrus-airbrake-hook.v2",
-    sum = "h1:OAj3g0cR6Dx/R07QgQe8wkA9RNjB2u4i700xBkIT4e0=",
-    version = "v2.1.2",
-)
-
-go_repository(
-    name = "in_gopkg_tomb_v1",
-    importpath = "gopkg.in/tomb.v1",
-    sum = "h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=",
-    version = "v1.0.0-20141024135613-dd632973f1e7",
+    sum = "h1:ruQGxdhGHe7FWOJPT0mKs5+pD2Xs1Bm/kdGlHO04FmM=",
+    version = "v1.2.1",
 )
 
 go_repository(
@@ -849,15 +763,15 @@ go_repository(
 go_repository(
     name = "org_golang_google_genproto",
     importpath = "google.golang.org/genproto",
-    sum = "h1:wDju+RU97qa0FZT0QnZDg9Uc2dH0Ql513kFvHocz+WM=",
-    version = "v0.0.0-20200117163144-32f20d992d24",
+    sum = "h1:+kGHl1aib/qcwaRi1CbqBZ1rk19r85MNUf8HaBghugY=",
+    version = "v0.0.0-20200526211855-cb27e3aa2013",
 )
 
 go_repository(
     name = "org_golang_google_protobuf",
     importpath = "google.golang.org/protobuf",
-    sum = "h1:4MY060fB1DLGMB/7MBTLnwQUY6+F09GEiz6SsrNqyzM=",
-    version = "v1.23.0",
+    sum = "h1:poC0iCcx0QXFYlS6nuq/8K+Ng5T55k0FXdzq52hVi4w=",
+    version = "v1.25.1-0.20200808011614-a180de9f97d9",
 )
 
 go_repository(
@@ -1024,8 +938,8 @@ go_repository(
 go_repository(
     name = "com_github_vishvananda_netns",
     importpath = "github.com/vishvananda/netns",
-    sum = "h1:mjAZxE1nh8yvuwhGHpdDqdhtNu2dgbpk93TwoXuk5so=",
-    version = "v0.0.0-20200520041808-52d707b772fe",
+    sum = "h1:4hwBBUfQCFe3Cym0ZtKyq7L16eZUtYKs+BaHDN6mAns=",
+    version = "v0.0.0-20200728191858-db3c7e526aae",
 )
 
 go_repository(
@@ -1073,6 +987,48 @@ go_repository(
 go_repository(
     name = "com_github_dpjacques_clockwork",
     importpath = "github.com/dpjacques/clockwork",
-    sum = "h1:7krODee+eIlZYoLiEDmP1kLFNCvd0bQ0eEXOympdN6U=",
-    version = "v0.1.1-0.20190114191937-d864eecc357b",
+    sum = "h1:l+j1wSnHcimOzeeKxtspsl6tCBTyikdYxcWqFZ+Ho2c=",
+    version = "v0.1.1-0.20200827220843-c1f524b839be",
+)
+
+go_repository(
+    name = "com_github_cilium_ebpf",
+    importpath = "github.com/cilium/ebpf",
+    sum = "h1:i8+1fuPLjSgAYXUyBlHNhFwjcfAsP4ufiuH1+PWkyDU=",
+    version = "v0.0.0-20200110133405-4032b1d8aae3",
+)
+
+go_repository(
+    name = "com_github_coreos_go_systemd_v22",
+    importpath = "github.com/coreos/go-systemd/v22",
+    sum = "h1:XJIw/+VlJ+87J+doOxznsAWIdmWuViOVhkQamW5YV28=",
+    version = "v22.0.0",
+)
+
+go_repository(
+    name = "com_github_cpuguy83_go_md2man_v2",
+    importpath = "github.com/cpuguy83/go-md2man/v2",
+    sum = "h1:EoUDS0afbrsXAZ9YQ9jdu/mZ2sXgT1/2yyNng4PGlyM=",
+    version = "v2.0.0",
+)
+
+go_repository(
+    name = "com_github_godbus_dbus_v5",
+    importpath = "github.com/godbus/dbus/v5",
+    sum = "h1:ZqHaoEF7TBzh4jzPmqVhE/5A1z9of6orkAe5uHoAeME=",
+    version = "v5.0.3",
+)
+
+go_repository(
+    name = "com_github_russross_blackfriday_v2",
+    importpath = "github.com/russross/blackfriday/v2",
+    sum = "h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=",
+    version = "v2.0.1",
+)
+
+go_repository(
+    name = "com_github_shurcool_sanitized_anchor_name",
+    importpath = "github.com/shurcooL/sanitized_anchor_name",
+    sum = "h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=",
+    version = "v1.0.0",
 )
diff --git a/debian/BUILD b/debian/BUILD
new file mode 100644
index 000000000..331f44a5c
--- /dev/null
+++ b/debian/BUILD
@@ -0,0 +1,59 @@
+load("//tools:defs.bzl", "pkg_deb", "pkg_tar")
+
+package(licenses = ["notice"])
+
+pkg_tar(
+    name = "debian-bin",
+    srcs = [
+        "//runsc",
+        "//shim/v1:gvisor-containerd-shim",
+        "//shim/v2:containerd-shim-runsc-v1",
+    ],
+    mode = "0755",
+    package_dir = "/usr/bin",
+)
+
+pkg_tar(
+    name = "debian-data",
+    extension = "tar.gz",
+    deps = [
+        ":debian-bin",
+        "//shim:config",
+    ],
+)
+
+genrule(
+    name = "debian-version",
+    # Note that runsc must appear in the srcs parameter and not the tools
+    # parameter, otherwise it will not be stamped. This is reasonable, as tools
+    # may be encoded differently in the build graph (cached more aggressively
+    # because they are assumes to be hermetic).
+    srcs = ["//runsc"],
+    outs = ["version.txt"],
+    # Note that the little dance here is necessary because files in the $(SRCS)
+    # attribute are not executable by default, and we can't touch in place.
+    cmd = "cp $(location //runsc:runsc) $(@D)/runsc && \
+        chmod a+x $(@D)/runsc && \
+        $(@D)/runsc -version | grep version | sed 's/^[^0-9]*//' > $@ && \
+        rm -f $(@D)/runsc",
+    stamp = 1,
+)
+
+pkg_deb(
+    name = "debian",
+    architecture = "amd64",
+    data = ":debian-data",
+    # Note that the description_file will be flatten (all newlines removed),
+    # and therefore it is kept to a simple one-line description. The expected
+    # format for debian packages is "short summary\nLonger explanation of
+    # tool." and this is impossible with the flattening.
+    description_file = "description",
+    homepage = "https://gvisor.dev/",
+    maintainer = "The gVisor Authors <gvisor-dev@googlegroups.com>",
+    package = "runsc",
+    postinst = "postinst.sh",
+    version_file = ":version.txt",
+    visibility = [
+        "//visibility:public",
+    ],
+)
diff --git a/runsc/debian/description b/debian/description
index 9e8e08805..9e8e08805 100644
--- a/runsc/debian/description
+++ b/debian/description
diff --git a/runsc/debian/postinst.sh b/debian/postinst.sh
index d1e28e17b..6a326f823 100755
--- a/runsc/debian/postinst.sh
+++ b/debian/postinst.sh
@@ -21,7 +21,7 @@ fi
 # Update docker configuration.
 if [ -f /etc/docker/daemon.json ]; then
   runsc install
-  if systemctl status docker 2>/dev/null; then
+  if systemctl is-active -q docker; then
     systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2
   fi
 fi
diff --git a/g3doc/architecture_guide/performance.md b/g3doc/architecture_guide/performance.md
index 39dbb0045..b981f0c01 100644
--- a/g3doc/architecture_guide/performance.md
+++ b/g3doc/architecture_guide/performance.md
@@ -30,7 +30,7 @@ is distinct from **structural costs**. Improvements here are ongoing and driven
 by the workloads that matter to gVisor users and contributors.
 
 This page provides a guide for understanding baseline performance, and calls out
-distint **structural costs** and **implementation costs**, highlighting where
+distinct **structural costs** and **implementation costs**, highlighting where
 improvements are possible and not possible.
 
 While we include a variety of workloads here, it’s worth emphasizing that gVisor
@@ -211,7 +211,7 @@ url="/performance/applications.csv" title="perf.py http.(node|ruby)
 
 The above figure shows the result of simple `node` and `ruby` web services that
 render a template upon receiving a request. Because these synthetic benchmarks
-do minimal work per request, must like the `redis` case, they suffer from high
+do minimal work per request, much like the `redis` case, they suffer from high
 overheads. In practice, the more work an application does the smaller the impact
 of **structural costs** become.
 
diff --git a/g3doc/architecture_guide/resources.md b/g3doc/architecture_guide/resources.md
index 1dec37bd1..fc997d40c 100644
--- a/g3doc/architecture_guide/resources.md
+++ b/g3doc/architecture_guide/resources.md
@@ -19,12 +19,12 @@ sandboxed process:
 
 Much like a Virtual Machine (VM), a gVisor sandbox appears as an opaque process
 on the system. Processes within the sandbox do not manifest as processes on the
-host system, and process-level interactions within the sandbox requires entering
+host system, and process-level interactions within the sandbox require entering
 the sandbox (e.g. via a [Docker exec][exec]).
 
 ## Networking
 
-The sandbox attaches a network endpoint to the system, but runs it's own network
+The sandbox attaches a network endpoint to the system, but runs its own network
 stack. All network resources, other than packets in flight on the host, exist
 only inside the sandbox, bound by relevant resource limits.
 
diff --git a/g3doc/architecture_guide/security.md b/g3doc/architecture_guide/security.md
index b99b86332..9363d834c 100644
--- a/g3doc/architecture_guide/security.md
+++ b/g3doc/architecture_guide/security.md
@@ -104,7 +104,7 @@ interactions with a guest operating system and a set of virtualized hardware
 devices. These hardware devices are then implemented via the host System API by
 a Virtual Machine Monitor (VMM). The Sentry similarly prevents direct
 interactions by providing its own implementation of the System API that the
-application must interact with. Applications are not able to to directly craft
+application must interact with. Applications are not able to directly craft
 specific arguments or flags for the host System API, or interact directly with
 host primitives.
 
diff --git a/g3doc/style.md b/g3doc/style.md
index d10549fe9..8258b0233 100644
--- a/g3doc/style.md
+++ b/g3doc/style.md
@@ -46,6 +46,15 @@ protected.
 Each field or variable protected by a mutex should state as such in a comment on
 the field or variable declaration.
 
+### Function comments
+
+Functions with special entry conditions (e.g., a lock must be held) should state
+these conditions in a `Preconditions:` comment block. One condition per line;
+multiple conditions are specified with a bullet (`*`).
+
+Functions with notable exit conditions (e.g., a `Done` function must eventually
+be called by the caller) can similarly have a `Postconditions:` block.
+
 ### Unused returns
 
 Unused returns should be explicitly ignored with underscores. If there is a
diff --git a/g3doc/user_guide/FAQ.md b/g3doc/user_guide/FAQ.md
index 89df65e99..69033357c 100644
--- a/g3doc/user_guide/FAQ.md
+++ b/g3doc/user_guide/FAQ.md
@@ -74,11 +74,10 @@ directories.
 
 ### I'm getting an error like: `panic: unable to attach: operation not permitted` or `fork/exec /proc/self/exe: invalid argument: unknown` {#runsc-perms}
 
-Make sure that permissions and the owner is correct on the `runsc` binary.
+Make sure that permissions is correct on the `runsc` binary.
 
 ```bash
-sudo chown root:root /usr/local/bin/runsc
-sudo chmod 0755 /usr/local/bin/runsc
+sudo chmod a+rx /usr/local/bin/runsc
 ```
 
 ### I'm getting an error like `mount submount "/etc/hostname": creating mount with source ".../hostname": input/output error: unknown.` {#memlock}
@@ -96,6 +95,30 @@ containerd.
 
 See [issue #1765](https://gvisor.dev/issue/1765) for more details.
 
+### I'm getting an error like `RuntimeHandler "runsc" not supported` {#runtime-handler}
+
+This error indicates that the Kubernetes CRI runtime was not set up to handle
+`runsc` as a runtime handler. Please ensure that containerd configuration has
+been created properly and containerd has been restarted. See the
+[containerd quick start](containerd/quick_start.md) for more details.
+
+If you have ensured that containerd has been set up properly and you used
+kubeadm to create your cluster please check if Docker is also installed on that
+system. Kubeadm prefers using Docker if both Docker and containerd are
+installed.
+
+Please recreate your cluster and set the `--cni-socket` option on kubeadm
+commands. For example:
+
+```bash
+kubeadm init --cni-socket=/var/run/containerd/containerd.sock ...
+```
+
+To fix an existing cluster edit the `/var/lib/kubelet/kubeadm-flags.env` file
+and set the `--container-runtime` flag to `remote` and set the
+`--container-runtime-endpoint` flag to point to the containerd socket. e.g.
+`/var/run/containerd/containerd.sock`.
+
 ### My container cannot resolve another container's name when using Docker user defined bridge {#docker-bridge}
 
 This is normally indicated by errors like `bad address 'container-name'` when
diff --git a/g3doc/user_guide/containerd/quick_start.md b/g3doc/user_guide/containerd/quick_start.md
index 2f67eecb3..a98fe5c4a 100644
--- a/g3doc/user_guide/containerd/quick_start.md
+++ b/g3doc/user_guide/containerd/quick_start.md
@@ -1,7 +1,10 @@
 # Containerd Quick Start
 
-This document describes how to install and configure `containerd-shim-runsc-v1`
-using the containerd runtime handler support on `containerd` 1.2 or later.
+This document describes how to use `containerd-shim-runsc-v1` with the
+containerd runtime handler support on `containerd` 1.2 or later.
+
+> ⚠️ NOTE: If you are using Kubernetes and set up your cluster using kubeadm you
+> may run into issues. See the [FAQ](../FAQ.md#runtime-handler) for details.
 
 ## Requirements
 
diff --git a/g3doc/user_guide/install.md b/g3doc/user_guide/install.md
index 9afdd264d..c3ced9d61 100644
--- a/g3doc/user_guide/install.md
+++ b/g3doc/user_guide/install.md
@@ -5,6 +5,72 @@
 > Note: gVisor supports only x86\_64 and requires Linux 4.14.77+
 > ([older Linux](./networking.md#gso)).
 
+## Install latest release {#install-latest}
+
+To download and install the latest release manually follow these steps:
+
+```bash
+(
+  set -e
+  URL=https://storage.googleapis.com/gvisor/releases/release/latest
+  wget ${URL}/runsc ${URL}/runsc.sha512 \
+    ${URL}/gvisor-containerd-shim ${URL}/gvisor-containerd-shim.sha512 \
+    ${URL}/containerd-shim-runsc-v1 ${URL}/containerd-shim-runsc-v1.sha512
+  sha512sum -c runsc.sha512 \
+    -c gvisor-containerd-shim.sha512 \
+    -c containerd-shim-runsc-v1.sha512
+  rm -f *.sha512
+  chmod a+rx runsc gvisor-containerd-shim containerd-shim-runsc-v1
+  sudo mv runsc gvisor-containerd-shim containerd-shim-runsc-v1 /usr/local/bin
+)
+```
+
+To install gVisor as a Docker runtime, run the following commands:
+
+```bash
+/usr/local/bin/runsc install
+sudo systemctl restart docker
+docker run --rm --runtime=runsc hello-world
+```
+
+For more details about using gVisor with Docker, see
+[Docker Quick Start](./quick_start/docker.md)
+
+Note: It is important to copy `runsc` to a location that is readable and
+executable to all users, since `runsc` executes itself as user `nobody` to avoid
+unnecessary privileges. The `/usr/local/bin` directory is a good place to put
+the `runsc` binary.
+
+## Install from an `apt` repository
+
+First, appropriate dependencies must be installed to allow `apt` to install
+packages via https:
+
+```bash
+sudo apt-get update && \
+sudo apt-get install -y \
+    apt-transport-https \
+    ca-certificates \
+    curl \
+    gnupg-agent \
+    software-properties-common
+```
+
+Next, the configure the key used to sign archives and the repository:
+
+```bash
+curl -fsSL https://gvisor.dev/archive.key | sudo apt-key add -
+sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases release main"
+```
+
+Now the runsc package can be installed:
+
+```bash
+sudo apt-get update && sudo apt-get install -y runsc
+```
+
+If you have Docker installed, it will be automatically configured.
+
 ## Versions
 
 The `runsc` binaries and repositories are available in multiple versions and
@@ -21,12 +87,16 @@ Binaries are available for every commit on the `master` branch, and are
 available at the following URL:
 
 `https://storage.googleapis.com/gvisor/releases/master/latest/runsc`
+`https://storage.googleapis.com/gvisor/releases/master/latest/runsc.sha512`
 
-Checksums for the release binary are at:
+You can use this link with the steps described in
+[Install latest release](#install-latest).
 
-`https://storage.googleapis.com/gvisor/releases/master/latest/runsc.sha512`
+For `apt` installation, use the `master` to configure the repository:
 
-For `apt` installation, use the `master` as the `${DIST}` below.
+```bash
+sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases master main"
+```
 
 ### Nightly
 
@@ -34,18 +104,22 @@ Nightly releases are built most nights from the master branch, and are available
 at the following URL:
 
 `https://storage.googleapis.com/gvisor/releases/nightly/latest/runsc`
-
-Checksums for the release binary are at:
-
 `https://storage.googleapis.com/gvisor/releases/nightly/latest/runsc.sha512`
 
+You can use this link with the steps described in
+[Install latest release](#install-latest).
+
 Specific nightly releases can be found at:
 
 `https://storage.googleapis.com/gvisor/releases/nightly/${yyyy-mm-dd}/runsc`
 
 Note that a release may not be available for every day.
 
-For `apt` installation, use the `nightly` as the `${DIST}` below.
+For `apt` installation, use the `nightly` to configure the repository:
+
+```bash
+sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases nightly main"
+```
 
 ### Latest release
 
@@ -53,105 +127,48 @@ The latest official release is available at the following URL:
 
 `https://storage.googleapis.com/gvisor/releases/release/latest`
 
-For `apt` installation, use the `release` as the `${DIST}` below.
-
-### Specific release
-
-A given release release is available at the following URL:
-
-`https://storage.googleapis.com/gvisor/releases/release/${yyyymmdd}`
-
-See the [releases][releases] page for information about specific releases.
-
-For `apt` installation of a specific release, which may include point updates,
-use the date of the release, e.g. `${yyyymmdd}`, as the `${DIST}` below.
-
-> Note: only newer releases may be available as `apt` repositories.
-
-### Point release
-
-A given point release is available at the following URL:
-
-`https://storage.googleapis.com/gvisor/releases/release/${yyyymmdd}.${rc}`
+You can use this link with the steps described in
+[Install latest release](#install-latest).
 
-Note that `apt` installation of a specific point release is not supported.
-
-## Install from an `apt` repository
-
-First, appropriate dependencies must be installed to allow `apt` to install
-packages via https:
+For `apt` installation, use the `release` to configure the repository:
 
 ```bash
-sudo apt-get update && \
-sudo apt-get install -y \
-    apt-transport-https \
-    ca-certificates \
-    curl \
-    gnupg-agent \
-    software-properties-common
+sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases release main"
 ```
 
-Next, the key used to sign archives should be added to your `apt` keychain:
-
-```bash
-curl -fsSL https://gvisor.dev/archive.key | sudo apt-key add -
-```
+### Specific release
 
-Based on the release type, you will need to substitute `${DIST}` below, using
-one of:
+A given release release is available at the following URL:
 
-*   `master`: For HEAD.
-*   `nightly`: For nightly releases.
-*   `release`: For the latest release.
-*   `${yyyymmdd}`: For a specific releases (see above).
+`https://storage.googleapis.com/gvisor/releases/release/${yyyymmdd}`
 
-The repository for the release you wish to install should be added:
+You can use this link with the steps described in
+[Install latest release](#install-latest).
 
-```bash
-sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases ${DIST} main"
-```
+See the [releases](https://github.com/google/gvisor/releases) page for
+information about specific releases.
 
-For example, to install the latest official release, you can use:
+For `apt` installation of a specific release, which may include point updates,
+use the date of the release for repository, e.g. `${yyyymmdd}`.
 
 ```bash
-sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases release main"
+sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases yyyymmdd main"
 ```
 
-Now the runsc package can be installed:
-
-```bash
-sudo apt-get update && sudo apt-get install -y runsc
-```
+> Note: only newer releases may be available as `apt` repositories.
 
-If you have Docker installed, it will be automatically configured.
+### Point release
 
-## Install directly
+A given point release is available at the following URL:
 
-The binary URLs provided above can be used to install directly. For example, the
-latest nightly binary can be downloaded, validated, and placed in an appropriate
-location by running:
+`https://storage.googleapis.com/gvisor/releases/release/${yyyymmdd}.${rc}`
 
-```bash
-(
-  set -e
-  URL=https://storage.googleapis.com/gvisor/releases/nightly/latest
-  wget ${URL}/runsc
-  wget ${URL}/runsc.sha512
-  sha512sum -c runsc.sha512
-  rm -f runsc.sha512
-  sudo mv runsc /usr/local/bin
-  sudo chown root:root /usr/local/bin/runsc
-  sudo chmod 0755 /usr/local/bin/runsc
-)
-```
+You can use this link with the steps described in
+[Install latest release](#install-latest).
 
-**It is important to copy this binary to a location that is accessible to all
-users, and ensure it is executable by all users**, since `runsc` executes itself
-as user `nobody` to avoid unnecessary privileges. The `/usr/local/bin` directory
-is a good place to put the `runsc` binary.
+Note that `apt` installation of a specific point release is not supported.
 
 After installation, try out `runsc` by following the
-[Docker Quick Start](./quick_start/docker.md) or
+[Docker Quick Start](./quick_start/docker.md),
+[Containerd QuickStart](./containerd/quick_start.md), or
 [OCI Quick Start](./quick_start/oci.md).
-
-[releases]: https://github.com/google/gvisor/releases
diff --git a/g3doc/user_guide/networking.md b/g3doc/user_guide/networking.md
index 4aa394c91..95f675633 100644
--- a/g3doc/user_guide/networking.md
+++ b/g3doc/user_guide/networking.md
@@ -2,9 +2,9 @@
 
 [TOC]
 
-gVisor implements its own network stack called [netstack][netstack]. All aspects
-of the network stack are handled inside the Sentry — including TCP connection
-state, control messages, and packet assembly — keeping it isolated from the host
+gVisor implements its own network stack called netstack. All aspects of the
+network stack are handled inside the Sentry — including TCP connection state,
+control messages, and packet assembly — keeping it isolated from the host
 network stack. Data link layer packets are written directly to the virtual
 device inside the network namespace setup by Docker or Kubernetes.
 
@@ -82,4 +82,3 @@ Offload (GSO) to run with a kernel that is newer than 3.17. Add the
 }
 ```
 
-[netstack]: https://github.com/google/netstack
diff --git a/g3doc/user_guide/quick_start/docker.md b/g3doc/user_guide/quick_start/docker.md
index 6ad594ecc..ee842e453 100644
--- a/g3doc/user_guide/quick_start/docker.md
+++ b/g3doc/user_guide/quick_start/docker.md
@@ -22,18 +22,6 @@ named "runsc" by default.
 sudo runsc install
 ```
 
-You may also wish to install a runtime entry for debugging. The `runsc install`
-command can accept options that will be passed to the runtime when it is invoked
-by Docker.
-
-```bash
-sudo runsc install --runtime runsc-debug -- \
-  --debug \
-  --debug-log=/tmp/runsc-debug.log \
-  --strace \
-  --log-packets
-```
-
 You must restart the Docker daemon after installing the runtime. Typically this
 is done via `systemd`:
 
@@ -85,6 +73,21 @@ $ docker run --runtime=runsc -it ubuntu dmesg
 Note that this is easily replicated by an attacker so applications should never
 use `dmesg` to verify the runtime in a security sensitive context.
 
+## Options
+
+You may also wish to install a runtime entry with different options. The `runsc
+install` command can accept flags that will be passed to the runtime when it is
+invoked by Docker. For example, to install a runtime with debugging enabled, run
+the following:
+
+```bash
+sudo runsc install --runtime runsc-debug -- \
+  --debug \
+  --debug-log=/tmp/runsc-debug.log \
+  --strace \
+  --log-packets
+```
+
 Next, look at the different options available for gVisor: [platform][platforms],
 [network][networking], [filesystem][filesystem].
 
diff --git a/g3doc/user_guide/tutorials/BUILD b/g3doc/user_guide/tutorials/BUILD
index 405026a33..f405349b3 100644
--- a/g3doc/user_guide/tutorials/BUILD
+++ b/g3doc/user_guide/tutorials/BUILD
@@ -15,6 +15,15 @@ doc(
 )
 
 doc(
+    name = "docker_compose",
+    src = "docker-compose.md",
+    category = "User Guide",
+    permalink = "/docs/tutorials/docker-compose/",
+    subcategory = "Tutorials",
+    weight = "20",
+)
+
+doc(
     name = "kubernetes",
     src = "kubernetes.md",
     category = "User Guide",
@@ -24,7 +33,7 @@ doc(
     ],
     permalink = "/docs/tutorials/kubernetes/",
     subcategory = "Tutorials",
-    weight = "20",
+    weight = "30",
 )
 
 doc(
@@ -33,5 +42,5 @@ doc(
     category = "User Guide",
     permalink = "/docs/tutorials/cni/",
     subcategory = "Tutorials",
-    weight = "30",
+    weight = "40",
 )
diff --git a/g3doc/user_guide/tutorials/cni.md b/g3doc/user_guide/tutorials/cni.md
index ce2fd09a8..a3507c25b 100644
--- a/g3doc/user_guide/tutorials/cni.md
+++ b/g3doc/user_guide/tutorials/cni.md
@@ -47,7 +47,7 @@ sudo mkdir -p /etc/cni/net.d
 
 sudo sh -c 'cat > /etc/cni/net.d/10-bridge.conf << EOF
 {
-  "cniVersion": "0.4.0",
+  "cniVersion": "0.3.1",
   "name": "mynet",
   "type": "bridge",
   "bridge": "cni0",
@@ -65,7 +65,7 @@ EOF'
 
 sudo sh -c 'cat > /etc/cni/net.d/99-loopback.conf << EOF
 {
-  "cniVersion": "0.4.0",
+  "cniVersion": "0.3.1",
   "name": "lo",
   "type": "loopback"
 }
diff --git a/g3doc/user_guide/tutorials/docker-compose.md b/g3doc/user_guide/tutorials/docker-compose.md
new file mode 100644
index 000000000..3284231f8
--- /dev/null
+++ b/g3doc/user_guide/tutorials/docker-compose.md
@@ -0,0 +1,100 @@
+# Wordpress with Docker Compose
+
+This page shows you how to deploy a sample [WordPress][wordpress] site using
+[Docker Compose][docker-compose].
+
+### Before you begin
+
+[Follow these instructions][docker-install] to install runsc with Docker. This
+document assumes that Docker and Docker Compose are installed and the runtime
+name chosen for gVisor is `runsc`.
+
+### Configuration
+
+We'll start by creating the `docker-compose.yaml` file to specify our services.
+We will specify two services, a `wordpress` service for the Wordpress Apache
+server, and a `db` service for MySQL. We will configure Wordpress to connect to
+MySQL via the `db` service host name.
+
+> **Note:** Docker Compose uses it's own network by default and allows services
+> to communicate using their service name. Docker Compose does this by setting
+> up a DNS server at IP address 127.0.0.11 and configuring containers to use it
+> via [resolv.conf][resolv.conf]. This IP is not addressable inside a gVisor
+> sandbox so it's important that we set the DNS IP address to the alternative
+> `8.8.8.8` and use a network that allows routing to it. See
+> [Networking in Compose][compose-networking] for more details.
+
+> **Note:** The `runtime` field was removed from services in the 3.x version of
+> the API in versions of docker-compose < 1.27.0. You will need to write your
+> `docker-compose.yaml` file using the 2.x format or use docker-compose >=
+> 1.27.0. See this [issue](https://github.com/docker/compose/issues/6239) for
+> more details.
+
+```yaml
+version: '2.3'
+
+services:
+   db:
+     image: mysql:5.7
+     volumes:
+       - db_data:/var/lib/mysql
+     restart: always
+     environment:
+       MYSQL_ROOT_PASSWORD: somewordpress
+       MYSQL_DATABASE: wordpress
+       MYSQL_USER: wordpress
+       MYSQL_PASSWORD: wordpress
+     # All services must be on the same network to communicate.
+     network_mode: "bridge"
+
+   wordpress:
+     depends_on:
+       - db
+     # When using the "bridge" network specify links.
+     links:
+       - db
+     image: wordpress:latest
+     ports:
+       - "8080:80"
+     restart: always
+     environment:
+       WORDPRESS_DB_HOST: db:3306
+       WORDPRESS_DB_USER: wordpress
+       WORDPRESS_DB_PASSWORD: wordpress
+       WORDPRESS_DB_NAME: wordpress
+     # Specify the dns address if needed.
+     dns:
+       - 8.8.8.8
+     # All services must be on the same network to communicate.
+     network_mode: "bridge"
+     # Specify the runtime used by Docker. Must be set up in
+     #  /etc/docker/daemon.json.
+     runtime: "runsc"
+
+volumes:
+    db_data: {}
+```
+
+Once you have a `docker-compose.yaml` in the current directory you can start the
+containers:
+
+```bash
+docker-compose up
+```
+
+Once the containers have started you can access wordpress at
+http://localhost:8080.
+
+Congrats! You now how a working wordpress site up and running using Docker
+Compose.
+
+### What's next
+
+Learn how to deploy [WordPress with Kubernetes][wordpress-k8s].
+
+[docker-compose]: https://docs.docker.com/compose/
+[docker-install]: ../quick_start/docker.md
+[wordpress]: https://wordpress.com/
+[resolv.conf]: https://man7.org/linux/man-pages/man5/resolv.conf.5.html
+[wordpress-k8s]: kubernetes.md
+[compose-networking]: https://docs.docker.com/compose/networking/
diff --git a/g3doc/user_guide/tutorials/docker.md b/g3doc/user_guide/tutorials/docker.md
index 705560038..9ca01da2a 100644
--- a/g3doc/user_guide/tutorials/docker.md
+++ b/g3doc/user_guide/tutorials/docker.md
@@ -60,9 +60,11 @@ Congratulations! You have just deployed a WordPress site using Docker.
 
 ### What's next
 
-[Learn how to deploy WordPress with Kubernetes][wordpress-k8s].
+Learn how to deploy WordPress with [Kubernetes][wordpress-k8s] or
+[Docker Compose][wordpress-compose].
 
 [docker]: https://www.docker.com/
-[docker-install]: /docs/user_guide/quick_start/docker/
+[docker-install]: ../quick_start/docker.md
 [wordpress]: https://wordpress.com/
-[wordpress-k8s]: /docs/tutorials/kubernetes/
+[wordpress-k8s]: kubernetes.md
+[wordpress-compose]: docker-compose.md
diff --git a/g3doc/user_guide/tutorials/kubernetes.md b/g3doc/user_guide/tutorials/kubernetes.md
index d2a94b1b7..1ec6e71e9 100644
--- a/g3doc/user_guide/tutorials/kubernetes.md
+++ b/g3doc/user_guide/tutorials/kubernetes.md
@@ -23,12 +23,12 @@ gcloud beta container node-pools create sandbox-pool --cluster=${CLUSTER_NAME} -
 If you prefer to use the console, select your cluster and select the **ADD NODE
 POOL** button:
 
-![+ ADD NODE POOL](./node-pool-button.png)
+![+ ADD NODE POOL](node-pool-button.png)
 
 Then select the **Image type** with **Containerd** and select **Enable sandbox
 with gVisor** option. Select other options as you like:
 
-![+ NODE POOL](./add-node-pool.png)
+![+ NODE POOL](add-node-pool.png)
 
 ### Check that gVisor is enabled
 
@@ -57,47 +57,149 @@ curl -LO https://k8s.io/examples/application/wordpress/mysql-deployment.yaml
 Add a **spec.template.spec.runtimeClassName** set to **gvisor** to both files,
 as shown below:
 
-**wordpress-deployment.yaml:** ```yaml apiVersion: v1 kind: Service metadata:
-name: wordpress labels: app: wordpress spec: ports: - port: 80 selector: app:
-wordpress tier: frontend
-
-## type: LoadBalancer
-
-apiVersion: v1 kind: PersistentVolumeClaim metadata: name: wp-pv-claim labels:
-app: wordpress spec: accessModes: - ReadWriteOnce resources: requests:
-
-## storage: 20Gi
-
-apiVersion: apps/v1 kind: Deployment metadata: name: wordpress labels: app:
-wordpress spec: selector: matchLabels: app: wordpress tier: frontend strategy:
-type: Recreate template: metadata: labels: app: wordpress tier: frontend spec:
-runtimeClassName: gvisor # ADD THIS LINE containers: - image:
-wordpress:4.8-apache name: wordpress env: - name: WORDPRESS_DB_HOST value:
-wordpress-mysql - name: WORDPRESS_DB_PASSWORD valueFrom: secretKeyRef: name:
-mysql-pass key: password ports: - containerPort: 80 name: wordpress
-volumeMounts: - name: wordpress-persistent-storage mountPath: /var/www/html
-volumes: - name: wordpress-persistent-storage persistentVolumeClaim: claimName:
-wp-pv-claim ```
-
-**mysql-deployment.yaml:** ```yaml apiVersion: v1 kind: Service metadata: name:
-wordpress-mysql labels: app: wordpress spec: ports: - port: 3306 selector: app:
-wordpress tier: mysql
-
-## clusterIP: None
-
-apiVersion: v1 kind: PersistentVolumeClaim metadata: name: mysql-pv-claim
-labels: app: wordpress spec: accessModes: - ReadWriteOnce resources: requests:
-
-## storage: 20Gi
+**wordpress-deployment.yaml:**
+
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: wordpress
+  labels:
+    app: wordpress
+spec:
+  ports:
+    - port: 80
+  selector:
+    app: wordpress
+    tier: frontend
+  type: LoadBalancer
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: wp-pv-claim
+  labels:
+    app: wordpress
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 20Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: wordpress
+  labels:
+    app: wordpress
+spec:
+  selector:
+    matchLabels:
+      app: wordpress
+      tier: frontend
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      labels:
+        app: wordpress
+        tier: frontend
+    spec:
+      runtimeClassName: gvisor   # ADD THIS LINE
+      containers:
+      - image: wordpress:4.8-apache
+        name: wordpress
+        env:
+        - name: WORDPRESS_DB_HOST
+          value: wordpress-mysql
+        - name: WORDPRESS_DB_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: mysql-pass
+              key: password
+        ports:
+        - containerPort: 80
+          name: wordpress
+        volumeMounts:
+        - name: wordpress-persistent-storage
+          mountPath: /var/www/html
+      volumes:
+      - name: wordpress-persistent-storage
+        persistentVolumeClaim:
+          claimName: wp-pv-claim
+```
 
-apiVersion: apps/v1 kind: Deployment metadata: name: wordpress-mysql labels:
-app: wordpress spec: selector: matchLabels: app: wordpress tier: mysql strategy:
-type: Recreate template: metadata: labels: app: wordpress tier: mysql spec:
-runtimeClassName: gvisor # ADD THIS LINE containers: - image: mysql:5.6 name:
-mysql env: - name: MYSQL_ROOT_PASSWORD valueFrom: secretKeyRef: name: mysql-pass
-key: password ports: - containerPort: 3306 name: mysql volumeMounts: - name:
-mysql-persistent-storage mountPath: /var/lib/mysql volumes: - name:
-mysql-persistent-storage persistentVolumeClaim: claimName: mysql-pv-claim ```
+**mysql-deployment.yaml:**
+
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: wordpress-mysql
+  labels:
+    app: wordpress
+spec:
+  ports:
+    - port: 3306
+  selector:
+    app: wordpress
+    tier: mysql
+  clusterIP: None
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: mysql-pv-claim
+  labels:
+    app: wordpress
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 20Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: wordpress-mysql
+  labels:
+    app: wordpress
+spec:
+  selector:
+    matchLabels:
+      app: wordpress
+      tier: mysql
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      labels:
+        app: wordpress
+        tier: mysql
+    spec:
+      runtimeClassName: gvisor   # ADD THIS LINE
+      containers:
+      - image: mysql:5.6
+        name: mysql
+        env:
+        - name: MYSQL_ROOT_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: mysql-pass
+              key: password
+        ports:
+        - containerPort: 3306
+          name: mysql
+        volumeMounts:
+        - name: mysql-persistent-storage
+          mountPath: /var/lib/mysql
+      volumes:
+      - name: mysql-persistent-storage
+        persistentVolumeClaim:
+          claimName: mysql-pv-claim
+```
 
 Note that apart from `runtimeClassName: gvisor`, nothing else about the
 Deployment has is changed.
diff --git a/go.mod b/go.mod
index 2fcba5cc9..e6df99177 100644
--- a/go.mod
+++ b/go.mod
@@ -1,52 +1,54 @@
 module gvisor.dev/gvisor
 
-go 1.14
+go 1.15
 
 replace github.com/Sirupsen/logrus => github.com/sirupsen/logrus v1.6.0
 
 require (
 	cloud.google.com/go v0.52.1-0.20200122224058-0482b626c726 // indirect
-	github.com/Microsoft/go-winio v0.4.15-0.20190919025122-fc70bd9a86b5 // indirect
+	github.com/Microsoft/go-winio v0.4.15-0.20200908182639-5b44b70ab3ab // indirect
 	github.com/Microsoft/hcsshim v0.8.6 // indirect
 	github.com/cenkalti/backoff v1.1.1-0.20190506075156-2146c9339422 // indirect
+	github.com/cilium/ebpf v0.0.0-20200110133405-4032b1d8aae3 // indirect
 	github.com/containerd/cgroups v0.0.0-20181219155423-39b18af02c41 // indirect
 	github.com/containerd/containerd v1.3.4 // indirect
-	github.com/containerd/continuity v0.0.0-20200710164510-efbc4488d8fe // indirect
+	github.com/containerd/continuity v0.0.0-20200928162600-f2cc35102c2a // indirect
 	github.com/containerd/fifo v0.0.0-20191213151349-ff969a566b00 // indirect
 	github.com/containerd/go-runc v0.0.0-20200220073739-7016d3ce2328 // indirect
 	github.com/containerd/ttrpc v0.0.0-20200121165050-0be804eadb15 // indirect
 	github.com/containerd/typeurl v0.0.0-20200205145503-b45ef1f1f737 // indirect
 	github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf // indirect
+	github.com/coreos/go-systemd/v22 v22.0.0 // indirect
+	github.com/cpuguy83/go-md2man/v2 v2.0.0 // indirect
 	github.com/docker/distribution v2.7.1-0.20190205005809-0d3efadf0154+incompatible // indirect
 	github.com/docker/docker v1.4.2-0.20191028175130-9e7d5ac5ea55 // indirect
 	github.com/docker/go-connections v0.3.0 // indirect
 	github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c // indirect
 	github.com/docker/go-units v0.4.0 // indirect
-	github.com/dpjacques/clockwork v0.1.1-0.20190114191937-d864eecc357b // indirect
+	github.com/dpjacques/clockwork v0.1.1-0.20200827220843-c1f524b839be // indirect
 	github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e // indirect
 	github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079 // indirect
 	github.com/gogo/googleapis v1.4.0 // indirect
-	github.com/golang/protobuf v1.4.2 // indirect
-	github.com/google/go-cmp v0.5.0 // indirect
+	github.com/google/go-cmp v0.5.1 // indirect
 	github.com/google/go-github/v28 v28.1.2-0.20191108005307-e555eab49ce8 // indirect
 	github.com/google/subcommands v1.0.2-0.20190508160503-636abe8753b8 // indirect
 	github.com/hashicorp/go-multierror v1.0.0 // indirect
 	github.com/kr/pty v1.1.4-0.20190131011033-7dc38fb350b1 // indirect
 	github.com/mohae/deepcopy v0.0.0-20170308212314-bb9b5e7adda9 // indirect
-	github.com/opencontainers/go-digest v1.0.0 // indirect
 	github.com/opencontainers/image-spec v1.0.1 // indirect
 	github.com/opencontainers/runc v0.1.1 // indirect
 	github.com/opencontainers/runtime-spec v1.0.2-0.20181111125026-1722abf79c2f // indirect
 	github.com/pborman/uuid v1.2.0 // indirect
 	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2 // indirect
-	github.com/urfave/cli v0.0.0-20171014202726-7bc6a0acffa5 // indirect
+	github.com/urfave/cli v1.22.2 // indirect
 	github.com/vishvananda/netlink v1.0.1-0.20190930145447-2ec5bdc52b86 // indirect
-	github.com/vishvananda/netns v0.0.0-20200520041808-52d707b772fe // indirect
-	go.uber.org/atomic v1.6.0 // indirect
+	github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae // indirect
+	go.uber.org/atomic v1.7.0 // indirect
 	go.uber.org/multierr v1.2.0 // indirect
 	golang.org/x/time v0.0.0-20191024005414-555d28b269f0 // indirect
-	golang.org/x/tools v0.0.0-20200707200213-416e8f4faf8a // indirect
+	golang.org/x/tools v0.0.0-20201002184944-ecd9fd270d5d // indirect
 	google.golang.org/grpc v1.29.0 // indirect
+	google.golang.org/protobuf v1.25.1-0.20200808011614-a180de9f97d9 // indirect
 	gopkg.in/yaml.v2 v2.2.8 // indirect
 	gotest.tools v2.2.0+incompatible // indirect
 )
diff --git a/go.sum b/go.sum
index f98132971..e713d2eaa 100644
--- a/go.sum
+++ b/go.sum
@@ -18,13 +18,15 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03
 github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
 github.com/Microsoft/go-winio v0.4.14 h1:+hMXMk01us9KgxGb7ftKQt2Xpf5hH/yky+TDA+qxleU=
 github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jBhyzoq1bpyYA=
-github.com/Microsoft/go-winio v0.4.15-0.20190919025122-fc70bd9a86b5 h1:ygIc8M6trr62pF5DucadTWGdEB4mEyvzi0e2nbcmcyA=
 github.com/Microsoft/go-winio v0.4.15-0.20190919025122-fc70bd9a86b5/go.mod h1:tTuCMEN+UleMWgg9dVx4Hu52b1bJo+59jBh3ajtinzw=
+github.com/Microsoft/go-winio v0.4.15-0.20200908182639-5b44b70ab3ab h1:9pygWVFqbY9lPxM0peffumuVDyMuIMzNLyO9uFjJuQo=
+github.com/Microsoft/go-winio v0.4.15-0.20200908182639-5b44b70ab3ab/go.mod h1:tTuCMEN+UleMWgg9dVx4Hu52b1bJo+59jBh3ajtinzw=
 github.com/Microsoft/hcsshim v0.8.6/go.mod h1:Op3hHsoHPAvb6lceZHDtd9OkTew38wNoXnJs8iY7rUg=
 github.com/Microsoft/hcsshim v0.8.7/go.mod h1:OHd7sQqRFrYd3RmSgbgji+ctCwkbq2wbEYNSzOYtcBQ=
 github.com/Microsoft/hcsshim v0.8.8/go.mod h1:5692vkUqntj1idxauYlpoINNKeqCiG6Sg38RRsjT5y8=
-github.com/Microsoft/hcsshim v0.8.9 h1:VrfodqvztU8YSOvygU+DN1BGaSGxmrNfqOv5oOuX2Bk=
 github.com/Microsoft/hcsshim v0.8.9/go.mod h1:5692vkUqntj1idxauYlpoINNKeqCiG6Sg38RRsjT5y8=
+github.com/Microsoft/hcsshim v0.8.10 h1:k5wTrpnVU2/xv8ZuzGkbXVd3js5zJ8RnumPo5RxiIxU=
+github.com/Microsoft/hcsshim v0.8.10/go.mod h1:g5uw8EV2mAlzqe94tfNBNdr89fnbD/n3HV0OhsddkmM=
 github.com/blang/semver v3.1.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
 github.com/cenkalti/backoff v1.1.1-0.20190506075156-2146c9339422 h1:8eZxmY1yvxGHzdzTEhI09npjMVGzNAdrqzruTX6jcK4=
 github.com/cenkalti/backoff v1.1.1-0.20190506075156-2146c9339422/go.mod h1:b6Nc7NRH5C4aCISLry0tLnTjcuTEvoiqcWDdsU0sOGM=
@@ -32,12 +34,13 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA
 github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
 github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
 github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
+github.com/cilium/ebpf v0.0.0-20200110133405-4032b1d8aae3/go.mod h1:MA5e5Lr8slmEg9bt0VpxxWqJlO4iwu3FBdHUzV7wQVg=
 github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
 github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
 github.com/containerd/cgroups v0.0.0-20181219155423-39b18af02c41 h1:5yg0k8gqOssNLsjjCtXIADoPbAtUtQZJfC8hQ4r2oFY=
 github.com/containerd/cgroups v0.0.0-20181219155423-39b18af02c41/go.mod h1:X9rLEHIqSf/wfK8NsPqxJmeZgW4pcfzdXITDrUSJ6uI=
-github.com/containerd/cgroups v0.0.0-20190919134610-bf292b21730f h1:tSNMc+rJDfmYntojat8lljbt1mgKNpTxUZJsSzJ9Y1s=
-github.com/containerd/cgroups v0.0.0-20190919134610-bf292b21730f/go.mod h1:OApqhQ4XNSNC13gXIwDjhOQxjWa/NxkwZXJ1EvqT0ko=
+github.com/containerd/cgroups v0.0.0-20200531161412-0dbf7f05ba59 h1:qWj4qVYZ95vLWwqyNJCQg7rDsG5wPdze0UaPolH7DUk=
+github.com/containerd/cgroups v0.0.0-20200531161412-0dbf7f05ba59/go.mod h1:pA0z1pT8KYB3TCXK/ocprsh7MAkoW8bZVzPdih9snmM=
 github.com/containerd/console v0.0.0-20180822173158-c12b1e7919c1/go.mod h1:Tj/on1eG8kiEhd0+fhSDzsPAFESxzBBvdyEgyryXffw=
 github.com/containerd/console v0.0.0-20191206165004-02ecf6a7291e h1:GdiIYd8ZDOrT++e1NjhSD4rGt9zaJukHm4rt5F4mRQc=
 github.com/containerd/console v0.0.0-20191206165004-02ecf6a7291e/go.mod h1:8Pf4gM6VEbTNRIT26AyyU7hxdQU3MvAvxVI0sc00XBE=
@@ -45,8 +48,8 @@ github.com/containerd/containerd v1.3.2/go.mod h1:bC6axHOhabU15QhwfG7w5PipXdVtMX
 github.com/containerd/containerd v1.3.4 h1:3o0smo5SKY7H6AJCmJhsnCjR2/V2T8VmiHt7seN2/kI=
 github.com/containerd/containerd v1.3.4/go.mod h1:bC6axHOhabU15QhwfG7w5PipXdVtMXFTttgp+kVtyUA=
 github.com/containerd/continuity v0.0.0-20190426062206-aaeac12a7ffc/go.mod h1:GL3xCUCBDV3CZiTSEKksMWbLE66hEyuu9qyDOOqM47Y=
-github.com/containerd/continuity v0.0.0-20200710164510-efbc4488d8fe h1:PEmIrUvwG9Yyv+0WKZqjXfSFDeZjs/q15g0m08BYS9k=
-github.com/containerd/continuity v0.0.0-20200710164510-efbc4488d8fe/go.mod h1:cECdGN1O8G9bgKTlLhuPJimka6Xb/Gg7vYzCTNVxhvo=
+github.com/containerd/continuity v0.0.0-20200928162600-f2cc35102c2a h1:jEIoR0aA5GogXZ8pP3DUzE+zrhaF6/1rYZy+7KkYEWM=
+github.com/containerd/continuity v0.0.0-20200928162600-f2cc35102c2a/go.mod h1:W0qIOTD7mp2He++YVq+kgfXezRYqzP1uDuMVH1bITDY=
 github.com/containerd/fifo v0.0.0-20190226154929-a9fb20d87448/go.mod h1:ODA38xgv3Kuk8dQz2ZQXpnv/UZZUHUCL7pnLehbXgQI=
 github.com/containerd/fifo v0.0.0-20191213151349-ff969a566b00 h1:lsjC5ENBl+Zgf38+B0ymougXFp0BaubeIVETltYZTQw=
 github.com/containerd/fifo v0.0.0-20191213151349-ff969a566b00/go.mod h1:jPQ2IAeZRCYxpS/Cm1495vGFww6ecHmMk1YJH2Q5ln0=
@@ -59,9 +62,12 @@ github.com/containerd/ttrpc v0.0.0-20200121165050-0be804eadb15/go.mod h1:UAxOpgT
 github.com/containerd/typeurl v0.0.0-20180627222232-a93fcdb778cd/go.mod h1:Cm3kwCdlkCfMSHURc+r6fwoGH6/F1hH3S4sg0rLFWPc=
 github.com/containerd/typeurl v0.0.0-20200205145503-b45ef1f1f737 h1:HovfQDS/K3Mr7eyS0QJLxE1CbVUhjZCl6g3OhFJgP1o=
 github.com/containerd/typeurl v0.0.0-20200205145503-b45ef1f1f737/go.mod h1:TB1hUtrpaiO88KEK56ijojHS1+NeF0izUACaJW2mdXg=
-github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
 github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf h1:iW4rZ826su+pqaw19uhpSCzhj44qo35pNgKFGqzDKkU=
 github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
+github.com/coreos/go-systemd/v22 v22.0.0 h1:XJIw/+VlJ+87J+doOxznsAWIdmWuViOVhkQamW5YV28=
+github.com/coreos/go-systemd/v22 v22.0.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk=
+github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
+github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/docker/distribution v2.7.1-0.20190205005809-0d3efadf0154+incompatible h1:dvc1KSkIYTVjZgHf/CTC2diTYC8PzhaA5sFISRfNVrE=
@@ -74,22 +80,22 @@ github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c h1:+pKlWGMw7gf6bQ
 github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c/go.mod h1:Uw6UezgYA44ePAFQYUehOuCzmy5zmg/+nl2ZfMWGkpA=
 github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw=
 github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
-github.com/dpjacques/clockwork v0.1.1-0.20190114191937-d864eecc357b h1:7krODee+eIlZYoLiEDmP1kLFNCvd0bQ0eEXOympdN6U=
-github.com/dpjacques/clockwork v0.1.1-0.20190114191937-d864eecc357b/go.mod h1:D8mP2A8vVT2GkXqPorSBmhnshhkFBYgzhA90KmJt25Y=
-github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
+github.com/dpjacques/clockwork v0.1.1-0.20200827220843-c1f524b839be h1:l+j1wSnHcimOzeeKxtspsl6tCBTyikdYxcWqFZ+Ho2c=
+github.com/dpjacques/clockwork v0.1.1-0.20200827220843-c1f524b839be/go.mod h1:D8mP2A8vVT2GkXqPorSBmhnshhkFBYgzhA90KmJt25Y=
+github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
 github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
 github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
 github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
 github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
-github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
 github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
 github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e h1:BWhy2j3IXJhjCbC68FptL43tDKIq8FladmaTs3Xs7Z8=
 github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e/go.mod h1:bBOAhwG1umN6/6ZUMtDFBMQR8jRg9O75tm9K00oMsK4=
+github.com/godbus/dbus/v5 v5.0.3 h1:ZqHaoEF7TBzh4jzPmqVhE/5A1z9of6orkAe5uHoAeME=
+github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
 github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079 h1:JFTFz3HZTGmgMz4E1TabNBNJljROSYgja1b4l50FNVs=
 github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079/go.mod h1:F1TvTiK9OcQqauNUHlbJvyl9Qa1QvF/gOUDKA14jxHU=
 github.com/gogo/googleapis v1.4.0 h1:zgVt4UpGxcqVOw97aRGxT4svlcmdK35fynLNctY32zI=
 github.com/gogo/googleapis v1.4.0/go.mod h1:5YRNX2z1oM5gXdAkurHa942MDgEJyk02w4OecKY87+c=
-github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4=
 github.com/gogo/protobuf v1.3.1 h1:DqDEcV5aeaTmdFBePNpYsp3FlcVH/2ISVVM9Qf8PSls=
 github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o=
 github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
@@ -109,8 +115,8 @@ github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:x
 github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
 github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
 github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
-github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0=
-github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
+github.com/golang/protobuf v1.4.1 h1:ZFgWrT+bLgsYPirOnRfKLYJLvssAegOj/hgyMFdJZe0=
+github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
 github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
 github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo=
 github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
@@ -118,8 +124,9 @@ github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5a
 github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
 github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
 github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
-github.com/google/go-cmp v0.5.0 h1:/QaMHBdZ26BB3SSst0Iwl10Epc+xhTquomWX0oZEB6w=
 github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.1 h1:JFrFEBb2xKufg6XkJsJr+WbKb4FQlURi5RUcBveYu9k=
+github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-github/v28 v28.1.2-0.20191108005307-e555eab49ce8 h1:zOOUQavr8D4AZrcV4ylUpbGa5j3jfeslN6Xculz3tVU=
 github.com/google/go-github/v28 v28.1.2-0.20191108005307-e555eab49ce8/go.mod h1:g82e6OHbJ0WYrYeOrid1MMfHAtqjxBz+N74tfAt9KrQ=
 github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck=
@@ -140,18 +147,18 @@ github.com/hashicorp/go-multierror v1.0.0 h1:iVjPR7a6H0tWELX5NxNe7bYopibicUzc7uP
 github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk=
 github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
 github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
-github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
 github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
 github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
 github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
 github.com/jstemmer/go-junit-report v0.9.1 h1:6QPYqodiu3GuPL+7mfx+NwDdp2eTkp9IfEUpgAwUN0o=
 github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk=
-github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q=
 github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00=
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/konsorten/go-windows-terminal-sequences v1.0.2 h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s=
 github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
+github.com/konsorten/go-windows-terminal-sequences v1.0.3 h1:CE8S1cTafDpPvMhIxNJKvHsGVBgn1xWYf1NbHQhywc8=
+github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/pty v1.1.4-0.20190131011033-7dc38fb350b1 h1:zc0R6cOw98cMengLA0fvU55mqbnN7sd/tBMLzSejp+M=
@@ -159,11 +166,7 @@ github.com/kr/pty v1.1.4-0.20190131011033-7dc38fb350b1/go.mod h1:pFQYn66WHrOpPYN
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/mohae/deepcopy v0.0.0-20170308212314-bb9b5e7adda9 h1:Sha2bQdoWE5YQPTlJOL31rmce94/tYi113SlFo1xQ2c=
 github.com/mohae/deepcopy v0.0.0-20170308212314-bb9b5e7adda9/go.mod h1:TaXosZuwdSHYgviHp1DAtfrULt5eUgsSMsZf+YrPgl8=
-github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
-github.com/onsi/ginkgo v1.10.1/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
-github.com/onsi/gomega v1.7.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
 github.com/opencontainers/go-digest v0.0.0-20180430190053-c9281466c8b2/go.mod h1:cMLVZDEM3+U2I4VmLI6N8jQYUd2OVphdqWwCJHrFt2s=
-github.com/opencontainers/go-digest v1.0.0-rc1/go.mod h1:cMLVZDEM3+U2I4VmLI6N8jQYUd2OVphdqWwCJHrFt2s=
 github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
 github.com/opencontainers/image-spec v1.0.1 h1:JMemWkRwHx4Zj+fVxWoMCFm/8sYGGrUVojFA6h/TRcI=
@@ -171,13 +174,13 @@ github.com/opencontainers/image-spec v1.0.1/go.mod h1:BtxoFyWECRxE4U/7sNtV5W15zM
 github.com/opencontainers/runc v0.0.0-20190115041553-12f6a991201f/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U=
 github.com/opencontainers/runc v0.1.1 h1:GlxAyO6x8rfZYN9Tt0Kti5a/cP41iuiO2yYT0IJGY8Y=
 github.com/opencontainers/runc v0.1.1/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U=
-github.com/opencontainers/runtime-spec v0.1.2-0.20190507144316-5b71a03e2700/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
 github.com/opencontainers/runtime-spec v1.0.1/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
 github.com/opencontainers/runtime-spec v1.0.2-0.20181111125026-1722abf79c2f h1:Pyp2f/uuhJIcUgnIeZaAbwOcyNz8TBlEe6mPpC8kXq8=
 github.com/opencontainers/runtime-spec v1.0.2-0.20181111125026-1722abf79c2f/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
+github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0=
+github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
 github.com/pborman/uuid v1.2.0 h1:J7Q5mO4ysT1dv8hyrUGHb9+ooztCXu1D8MY8DZYsu3g=
 github.com/pborman/uuid v1.2.0/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k=
-github.com/pkg/errors v0.8.1-0.20171018195549-f15c970de5b7/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
@@ -186,10 +189,13 @@ github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:
 github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
 github.com/prometheus/procfs v0.0.0-20190522114515-bc1a522cf7b1/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
 github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
-github.com/sirupsen/logrus v1.0.4-0.20170822132746-89742aefa4b2/go.mod h1:pMByvHTf9Beacp5x1UXfOR9xyW/9antXMhjMPG0dEzc=
+github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
+github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
 github.com/sirupsen/logrus v1.4.1/go.mod h1:ni0Sbl8bgC9z8RoU9G6nDWqqs/fq4eDPysMBDgk/93Q=
 github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4=
 github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
+github.com/sirupsen/logrus v1.6.0 h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I=
+github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88=
 github.com/spf13/cobra v0.0.2-0.20171109065643-2da4a54c5cee/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ=
 github.com/spf13/pflag v1.0.1-0.20171106142849-4c012f6dcd95/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
@@ -199,21 +205,20 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
 github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
 github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2 h1:b6uOv7YOFK0TYG7HtkIgExQo+2RdLuwRft63jn2HWj8=
 github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
-github.com/urfave/cli v0.0.0-20171014202726-7bc6a0acffa5/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA=
+github.com/urfave/cli v1.22.2/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
 github.com/vishvananda/netlink v1.0.1-0.20190930145447-2ec5bdc52b86 h1:7SWt9pGCMaw+N1ZhRsaLKaYNviFhxambdoaoYlDqz1w=
 github.com/vishvananda/netlink v1.0.1-0.20190930145447-2ec5bdc52b86/go.mod h1:+SR5DhBJrl6ZM7CoCKvpw5BKroDKQ+PJqOg65H/2ktk=
-github.com/vishvananda/netns v0.0.0-20200520041808-52d707b772fe h1:mjAZxE1nh8yvuwhGHpdDqdhtNu2dgbpk93TwoXuk5so=
-github.com/vishvananda/netns v0.0.0-20200520041808-52d707b772fe/go.mod h1:DD4vA1DwXk04H54A1oHXtwZmA0grkVMdPxx/VGLCah0=
-github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
+github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae h1:4hwBBUfQCFe3Cym0ZtKyq7L16eZUtYKs+BaHDN6mAns=
+github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae/go.mod h1:DD4vA1DwXk04H54A1oHXtwZmA0grkVMdPxx/VGLCah0=
+github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
 go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
 go.opencensus.io v0.22.2 h1:75k/FF0Q2YM8QYo07VPddOLBslDt1MZOdEslOHvmzAs=
 go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
-go.uber.org/atomic v1.6.0 h1:Ezj3JGmsOnG1MoRWQkPBsKLe9DwWD9QeXzTRzzldNVk=
-go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ=
+go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=
+go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
 go.uber.org/multierr v1.2.0 h1:6I+W7f5VwC5SV9dNrZ3qXrDB9mD0dyGOi/ZJmYw03T4=
 go.uber.org/multierr v1.2.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
-golang.org/x/crypto v0.0.0-20171113213409-9f005a07e0d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
@@ -233,7 +238,6 @@ golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTk
 golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
 golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
 golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
-golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
 golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f h1:J5lckAjkw6qYlOZNj90mLYNTEKDvWeuc1yieZ8qUzUE=
 golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs=
 golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE=
@@ -256,8 +260,8 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR
 golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20191004110552-13f9640d40b9/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
-golang.org/x/net v0.0.0-20200625001655-4c5254603344 h1:vGXIOMxbNfDTk/aXCmfdLgkrSV+Z2tcbze+pEc3v5W4=
-golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
+golang.org/x/net v0.0.0-20200822124328-c89045814202 h1:VvcQYSHwXgi7W+TpUR6A9g6Up98WAHf3f/ulnJ62IyA=
+golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
 golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
@@ -273,27 +277,24 @@ golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208 h1:qwRHBd0NqMbJxfbotnDhm2By
 golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20190514135907-3a4b5fb9f71f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20191022100944-742c48ecaeb7/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191120155948-bd437916bb0e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191210023423-ac6580df4449/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200120151820-655fe14d7479/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200217220822-9197077df867/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd h1:xhmwyvizuTgC2qz7ZlMluP20uW+C3Rm0FD/WLDX8884=
 golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20200523222454-059865788121 h1:rITEj+UZHYC927n8GT97eC3zrpzXdb/voyeOuVKS46o=
-golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
@@ -302,7 +303,6 @@ golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxb
 golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0 h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
-golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -319,16 +319,16 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw
 golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
-golang.org/x/tools v0.0.0-20191029041327-9cc4af7d6b2c/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20200117161641-43d50277825c/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
-golang.org/x/tools v0.0.0-20200707200213-416e8f4faf8a h1:YAl/dx/kLsMMIWGqfhFHW9ckqGhmq7Ki0dfoKAgvFTE=
-golang.org/x/tools v0.0.0-20200707200213-416e8f4faf8a/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA=
+golang.org/x/tools v0.0.0-20201002184944-ecd9fd270d5d h1:vWQvJ/Z0Lu+9/8oQ/pAYXNzbc7CMnBl+tULGVHOy3oE=
+golang.org/x/tools v0.0.0-20201002184944-ecd9fd270d5d/go.mod h1:z6u4i615ZeAfBE4XtMziQW1fSVJXACjjbWkB/mvPzlU=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
 google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
 google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
@@ -349,8 +349,9 @@ google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98
 google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
 google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8=
 google.golang.org/genproto v0.0.0-20200115191322-ca5a22157cba/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc=
-google.golang.org/genproto v0.0.0-20200117163144-32f20d992d24 h1:wDju+RU97qa0FZT0QnZDg9Uc2dH0Ql513kFvHocz+WM=
 google.golang.org/genproto v0.0.0-20200117163144-32f20d992d24/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc=
+google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013 h1:+kGHl1aib/qcwaRi1CbqBZ1rk19r85MNUf8HaBghugY=
+google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
 google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
 google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38=
 google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM=
@@ -358,6 +359,7 @@ google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyac
 google.golang.org/grpc v1.23.1/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
 google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
 google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
+google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
 google.golang.org/grpc v1.29.0 h1:2pJjwYOdkZ9HlN4sWRYBg9ttH5bCOlsueaM+b/oYjwo=
 google.golang.org/grpc v1.29.0/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk=
 google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
@@ -365,16 +367,13 @@ google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ
 google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
 google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
 google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
-google.golang.org/protobuf v1.23.0 h1:4MY060fB1DLGMB/7MBTLnwQUY6+F09GEiz6SsrNqyzM=
-google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
-gopkg.in/airbrake/gobrake.v2 v2.0.9/go.mod h1:/h5ZAUhDkGaJfjzjKLSjv6zCL6O0LLBxU4K+aSYdM/U=
+google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
+google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
+google.golang.org/protobuf v1.25.1-0.20200808011614-a180de9f97d9 h1:poC0iCcx0QXFYlS6nuq/8K+Ng5T55k0FXdzq52hVi4w=
+google.golang.org/protobuf v1.25.1-0.20200808011614-a180de9f97d9/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
-gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
-gopkg.in/gemnasium/logrus-airbrake-hook.v2 v2.1.2/go.mod h1:Xk6kEKp8OKb+X14hQBKWaSkCsqBpgog8nAV2xsGOxlo=
-gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
-gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw=
diff --git a/images/Makefile b/images/Makefile
index 278dec02f..12927c509 100644
--- a/images/Makefile
+++ b/images/Makefile
@@ -23,7 +23,7 @@ ARCH := $(shell uname -m)
 # tests are using locally-defined images (that are consistent and idempotent).
 REMOTE_IMAGE_PREFIX ?= gcr.io/gvisor-presubmit
 LOCAL_IMAGE_PREFIX ?= gvisor.dev/images
-ALL_IMAGES := $(subst /,_,$(subst ./,,$(shell find . -name Dockerfile -exec dirname {} \;)))
+ALL_IMAGES := $(subst /,_,$(subst ./,,$(shell find . -name Dockerfile -o -name Dockerfile.$(ARCH) | xargs -n 1 dirname | uniq)))
 ifneq ($(ARCH),$(shell uname -m))
 DOCKER_PLATFORM_ARGS := --platform=$(ARCH)
 else
@@ -51,6 +51,7 @@ load-%-images:
 # ensuring that images will always be sourced using the local files if there
 # are changes.
 path = $(subst _,/,$(1))
+dockerfile = $$(if [ -f "$(call path,$(1))/Dockerfile.$(ARCH)" ]; then echo Dockerfile.$(ARCH); else echo Dockerfile; fi)
 tag = $(shell find $(call path,$(1)) -type f -print | sort | xargs -n 1 sha256sum | sha256sum - | cut -c 1-16)
 remote_image = $(REMOTE_IMAGE_PREFIX)/$(subst _,/,$(1))_$(ARCH):$(call tag,$(1))
 local_image = $(LOCAL_IMAGE_PREFIX)/$(subst _,/,$(1))
@@ -59,11 +60,17 @@ local_image = $(LOCAL_IMAGE_PREFIX)/$(subst _,/,$(1))
 # we need to explicitly repull the base layer in order to ensure that the
 # architecture is correct. Note that we use the term "rebuild" here to avoid
 # conflicting with the bazel "build" terminology, which is used elsewhere.
-rebuild-%: FROM=$(shell grep FROM $(call path,$*)/Dockerfile } cut -d' ' -f2)
+rebuild-%: FROM=$(shell grep FROM "$(call path,$*)/$(call dockerfile,$*)" | cut -d' ' -f2)
 rebuild-%: register-cross
-	$(foreach IMAGE,$(FROM),docker $(DOCKER_PLATFORM_ARGS) $(IMAGE); &&) true
+	@if ! [ -f "$(call path,$*)/$(call dockerfile,$*)" ]; then \
+		(echo "ERROR: Dockerfile for $* not found (is it available for $(ARCH)?)." >&2 && exit 1); \
+	fi
+	$(foreach IMAGE,$(FROM),docker pull $(DOCKER_PLATFORM_ARGS) $(IMAGE) &&) \
 	T=$$(mktemp -d) && cp -a $(call path,$*)/* $$T && \
-		docker build $(DOCKER_PLATFORM_ARGS) -t $(call remote_image,$*) $$T && \
+		docker build $(DOCKER_PLATFORM_ARGS) \
+			-f "$$T/$(call dockerfile,$*)" \
+			-t "$(call remote_image,$*)" \
+			$$T && \
 		rm -rf $$T
 
 # pull will check the "remote" image and pull if necessary. If the remote image
@@ -73,10 +80,10 @@ pull-%:
 	docker pull $(DOCKER_PLATFORM_ARGS) $(call remote_image,$*)
 
 # load will either pull the "remote" or build it locally. This is the preferred
-# entrypoint, as it should never file. The local tag should always be set after
+# entrypoint, as it should never fail. The local tag should always be set after
 # this returns (either by the pull or the build).
 load-%:
-	docker inspect $(call remote_image,$*) >/dev/null 2>&1 || $(MAKE) pull-$* || $(MAKE) rebuild-$*
+	$(MAKE) pull-$* || $(MAKE) rebuild-$*
 	docker tag $(call remote_image,$*) $(call local_image,$*)
 
 # push pushes the remote image, after either pulling (to validate that the tag
diff --git a/images/basic/mysql/Dockerfile b/images/basic/mysql/Dockerfile
index 95da9c48d..d87bfe55b 100644
--- a/images/basic/mysql/Dockerfile
+++ b/images/basic/mysql/Dockerfile
@@ -1 +1 @@
-FROM mysql:8.0.19
+FROM mysql/mysql-server:8.0.19
diff --git a/images/basic/tomcat/Dockerfile.aarch64 b/images/basic/tomcat/Dockerfile.aarch64
new file mode 100644
index 000000000..ed4096de9
--- /dev/null
+++ b/images/basic/tomcat/Dockerfile.aarch64
@@ -0,0 +1 @@
+FROM arm64v8/tomcat:8.0
diff --git a/images/benchmarks/httpd/Dockerfile b/images/benchmarks/httpd/Dockerfile
index b72406012..e95538a40 100644
--- a/images/benchmarks/httpd/Dockerfile
+++ b/images/benchmarks/httpd/Dockerfile
@@ -8,7 +8,7 @@ RUN set -x \
 
 # Generate a bunch of relevant files.
 RUN mkdir -p /local && \
-        for size in 1 10 100 1000 1024 10240; do \
+        for size in 1 10 100 1024 10240; do \
                 dd if=/dev/zero of=/local/latin${size}k.txt count=${size} bs=1024; \
         done
 
diff --git a/images/benchmarks/nginx/Dockerfile b/images/benchmarks/nginx/Dockerfile
index b64eb52ae..c8e3330d0 100644
--- a/images/benchmarks/nginx/Dockerfile
+++ b/images/benchmarks/nginx/Dockerfile
@@ -1 +1,12 @@
 FROM nginx:1.15.10
+
+# Generate a bunch of relevant files.
+RUN mkdir -p /local && \
+        for size in 1 10 100 1024 10240; do \
+                dd if=/dev/zero of=/local/latin${size}k.txt count=${size} bs=1024; \
+        done
+
+RUN touch /local/index.html
+
+COPY ./nginx.conf /etc/nginx/nginx.conf
+COPY ./nginx_gofer.conf /etc/nginx/nginx_gofer.conf
diff --git a/images/benchmarks/nginx/nginx.conf b/images/benchmarks/nginx/nginx.conf
new file mode 100644
index 000000000..2c43c0cda
--- /dev/null
+++ b/images/benchmarks/nginx/nginx.conf
@@ -0,0 +1,19 @@
+user  nginx;
+worker_processes  1;
+daemon off;
+
+error_log  /var/log/nginx/error.log warn;
+pid        /var/run/nginx.pid;
+
+events {
+    worker_connections  1024;
+}
+
+
+http {
+    server {
+      location / {
+        root /tmp/html;
+      }
+    }
+}
diff --git a/images/benchmarks/nginx/nginx_gofer.conf b/images/benchmarks/nginx/nginx_gofer.conf
new file mode 100644
index 000000000..dbba2a575
--- /dev/null
+++ b/images/benchmarks/nginx/nginx_gofer.conf
@@ -0,0 +1,19 @@
+user  nginx;
+worker_processes  1;
+daemon off;
+
+error_log  /var/log/nginx/error.log warn;
+pid        /var/run/nginx.pid;
+
+events {
+    worker_connections  1024;
+}
+
+
+http {
+    server {
+      location / {
+        root /local;
+      }
+    }
+}
diff --git a/images/defs.bzl b/images/defs.bzl
new file mode 100644
index 000000000..c1f96e312
--- /dev/null
+++ b/images/defs.bzl
@@ -0,0 +1,34 @@
+"""Helpers for Docker image generation."""
+
+def _docker_image_impl(ctx):
+    importer = ctx.actions.declare_file(ctx.label.name)
+
+    importer_content = [
+        "#!/bin/bash",
+        "set -euo pipefail",
+        "source_file='%s'" % ctx.file.data.path,
+        "if [[ ! -f \"$source_file\" ]]; then",
+        "  source_file='%s'" % ctx.file.data.short_path,
+        "fi",
+        "exec docker import " + " ".join([
+            "-c '%s'" % attr
+            for attr in ctx.attr.statements
+        ]) + " \"$source_file\" $1",
+        "",
+    ]
+
+    ctx.actions.write(importer, "\n".join(importer_content), is_executable = True)
+    return [DefaultInfo(
+        runfiles = ctx.runfiles([ctx.file.data]),
+        executable = importer,
+    )]
+
+docker_image = rule(
+    implementation = _docker_image_impl,
+    doc = "Tool to import a Docker image; takes a single parameter (image name).",
+    attrs = {
+        "statements": attr.string_list(doc = "Extra Dockerfile directives."),
+        "data": attr.label(doc = "Image filesystem tarball", allow_single_file = [".tgz", ".tar.gz"]),
+    },
+    executable = True,
+)
diff --git a/images/jekyll/Dockerfile b/images/jekyll/Dockerfile.x86_64
index ba039ba15..ae19f3bfc 100644
--- a/images/jekyll/Dockerfile
+++ b/images/jekyll/Dockerfile.x86_64
@@ -1,5 +1,6 @@
 FROM jekyll/jekyll:4.0.0
 USER root
+
 RUN gem install \
         html-proofer:3.10.2 \
         nokogiri:1.10.1 \
@@ -10,5 +11,9 @@ RUN gem install \
         jekyll-relative-links:0.6.1 \
         jekyll-feed:0.13.0 \
         jekyll-sitemap:1.4.0
+
+# checks.rb is used with html-proofer for presubmit checks.
 COPY checks.rb /checks.rb
-CMD ["/usr/gem/gems/jekyll-4.0.0/exe/jekyll", "build", "-t", "-s", "/input", "-d", "/output"]
+
+COPY build.sh /build.sh
+CMD ["/build.sh"]
diff --git a/scripts/fuse_tests.sh b/images/jekyll/build.sh
index bbaaa99fc..010972ea6 100755
--- a/scripts/fuse_tests.sh
+++ b/images/jekyll/build.sh
@@ -14,7 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-source $(dirname $0)/common.sh
+set -euxo pipefail
 
-# Run all vfs2_fuse system call tests.
-test --test_tag_filters=fuse //test/fuse/...
+# Generate the syntax highlighting css file.
+/usr/gem/bin/rougify style github >/input/_sass/syntax.css
+# Build website including pages irrespective of date.
+/usr/gem/bin/jekyll build --future -t -s /input -d /output
diff --git a/images/packetdrill/Dockerfile b/images/packetdrill/Dockerfile
index 01296dbaf..b4cd73006 100644
--- a/images/packetdrill/Dockerfile
+++ b/images/packetdrill/Dockerfile
@@ -1,8 +1,8 @@
 FROM ubuntu:bionic
 RUN apt-get update && apt-get install -y net-tools git iptables iputils-ping \
         netcat tcpdump jq tar bison flex make
+# Pick up updated git.
 RUN hash -r
 RUN git clone --depth 1 --branch packetdrill-v2.0 \
         https://github.com/google/packetdrill.git
 RUN cd packetdrill/gtests/net/packetdrill && ./configure && make
-CMD /bin/bash
diff --git a/images/packetimpact/Dockerfile b/images/packetimpact/Dockerfile
index 87aa99ef2..906d5cdd6 100644
--- a/images/packetimpact/Dockerfile
+++ b/images/packetimpact/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:bionic
+FROM ubuntu:focal
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
         # iptables to disable OS native packet processing.
         iptables \
@@ -11,6 +11,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
         # tshark to log verbose packet sniffing.
         tshark \
         # killall for cleanup.
-        psmisc
-RUN hash -r
-CMD /bin/bash
+        psmisc \
+        # qemu-system-x86 to emulate fuchsia.
+        qemu-system-x86 \
+        # sha1sum to generate entropy.
+        libdigest-sha-perl
diff --git a/nogo.yaml b/nogo.yaml
new file mode 100644
index 000000000..07fd9aa4d
--- /dev/null
+++ b/nogo.yaml
@@ -0,0 +1,495 @@
+groups:
+  # We define three basic groups: generated (all generated files),
+  # exteranl (all files outside the repository), and internal (all
+  # files within the local repository). We can't enforce many style
+  # checks on generated and external code, so enable those cases
+  # selectively for analyzers below.
+  - name: generated
+    regex: "^(bazel-genfiles|bazel-out|bazel-bin)/"
+    default: true
+  - name: external
+    regex: "^external/"
+    default: false
+  - name: internal
+    regex: ".*"
+    default: true
+global:
+  generated:
+    suppress:
+      # Suppress the basic style checks for
+      # generated code, but keep the analysis
+      # that are required for quality & security.
+      - "should not use ALL_CAPS in Go names"
+      - "should not use underscores"
+      - "comment on exported"
+      - "methods on the same type should have the same receiver name"
+      - "at least one file in a package"
+      - "package comment should be of the form"
+      # Generated code may have dead code paths.
+      - "identical build constraints"
+      - "no value of type"
+      - "is never used"
+      # go_embed_data rules generate unicode literals.
+      - "string literal contains the Unicode format character"
+      - "string literal contains the Unicode control character"
+      # Some external code will generate protov1
+      # implementations. These should be ignored.
+      - "proto.* is deprecated"
+      - "xxx_messageInfo_.*"
+      - "receiver name should be a reflection of its identity"
+      # Generated gRPC code is not compliant either.
+      - "error strings should not be capitalized"
+      - "grpc.Errorf is deprecated"
+  internal:
+    suppress:
+      # We use ALL_CAPS for system definitions,
+      # which are common enough in the code base
+      # that we shouldn't annotate exceptions.
+      #
+      # Same story for underscores.
+      - "should not use ALL_CAPS in Go names"
+      - "should not use underscores in Go names"
+    exclude:
+      # A variety of staticcheck and stylecheck
+      # rules apply here. These should be fixed
+      # and removed from here, and the global
+      # rules should be used sparingly.
+      - pkg/abi/linux/fuse.go:22
+      - pkg/abi/linux/fuse.go:25
+      - pkg/abi/linux/socket.go:113
+      - pkg/abi/linux/tty.go:73
+      - pkg/bpf/decoder.go:112
+      - pkg/cpuid/cpuid_x86.go:675
+      - pkg/eventchannel/event.go:193
+      - pkg/eventchannel/event.go:27
+      - pkg/eventchannel/event_test.go:22
+      - pkg/eventchannel/rate.go:19
+      - pkg/gohacks/gohacks_unsafe.go:33
+      - pkg/log/json.go:30
+      - pkg/log/log.go:359
+      - pkg/merkletree/merkletree.go:230
+      - pkg/merkletree/merkletree.go:243
+      - pkg/merkletree/merkletree.go:249
+      - pkg/merkletree/merkletree.go:266
+      - pkg/merkletree/merkletree.go:355
+      - pkg/merkletree/merkletree.go:369
+      - pkg/metric/metric_test.go:20
+      - pkg/p9/p9test/client_test.go:687
+      - pkg/p9/transport_test.go:196
+      - pkg/pool/pool.go:15
+      - pkg/refs/refcounter.go:510
+      - pkg/refs/refcounter_test.go:169
+      - pkg/refs_vfs2/refs.go:16
+      - pkg/safemem/block_unsafe.go:89
+      - pkg/seccomp/seccomp.go:82
+      - pkg/segment/test/set_functions.go:15
+      - pkg/sentry/arch/signal.go:166
+      - pkg/sentry/arch/signal.go:171
+      - pkg/sentry/control/pprof.go:196
+      - pkg/sentry/devices/memdev/full.go:58
+      - pkg/sentry/devices/memdev/null.go:59
+      - pkg/sentry/devices/memdev/random.go:68
+      - pkg/sentry/devices/memdev/zero.go:86
+      - pkg/sentry/fdimport/fdimport.go:15
+      - pkg/sentry/fs/attr.go:257
+      - pkg/sentry/fsbridge/fs.go:116
+      - pkg/sentry/fsbridge/vfs.go:124
+      - pkg/sentry/fsbridge/vfs.go:70
+      - pkg/sentry/fs/copy_up.go:365
+      - pkg/sentry/fs/copy_up_test.go:65
+      - pkg/sentry/fs/dev/net_tun.go:161
+      - pkg/sentry/fs/dev/net_tun.go:63
+      - pkg/sentry/fs/dev/null.go:97
+      - pkg/sentry/fs/dirent_cache.go:64
+      - pkg/sentry/fs/file_overlay.go:327
+      - pkg/sentry/fs/file_overlay.go:524
+      - pkg/sentry/fs/filetest/filetest.go:55
+      - pkg/sentry/fs/filetest/filetest.go:60
+      - pkg/sentry/fs/fs.go:77
+      - pkg/sentry/fs/fsutil/file.go:290
+      - pkg/sentry/fs/fsutil/file.go:346
+      - pkg/sentry/fs/fsutil/host_file_mapper.go:105
+      - pkg/sentry/fs/fsutil/inode_cached.go:676
+      - pkg/sentry/fs/fsutil/inode_cached.go:772
+      - pkg/sentry/fs/gofer/attr.go:120
+      - pkg/sentry/fs/gofer/fifo.go:33
+      - pkg/sentry/fs/gofer/inode.go:410
+      - pkg/sentry/fsimpl/devpts/devpts.go:110
+      - pkg/sentry/fsimpl/devpts/devpts.go:246
+      - pkg/sentry/fsimpl/devpts/devpts.go:50
+      - pkg/sentry/fsimpl/devpts/master.go:110
+      - pkg/sentry/fsimpl/devpts/master.go:55
+      - pkg/sentry/fsimpl/devpts/replica.go:113
+      - pkg/sentry/fsimpl/devpts/replica.go:57
+      - pkg/sentry/fsimpl/devtmpfs/devtmpfs.go:54
+      - pkg/sentry/fsimpl/ext/disklayout/superblock_64.go:97
+      - pkg/sentry/fsimpl/ext/disklayout/superblock_old.go:92
+      - pkg/sentry/fsimpl/ext/disklayout/block_group_32.go:44
+      - pkg/sentry/fsimpl/ext/disklayout/inode_new.go:91
+      - pkg/sentry/fsimpl/ext/disklayout/inode_old.go:93
+      - pkg/sentry/fsimpl/ext/disklayout/superblock_32.go:66
+      - pkg/sentry/fsimpl/ext/disklayout/block_group_64.go:53
+      - pkg/sentry/fsimpl/eventfd/eventfd.go:268
+      - pkg/sentry/fsimpl/ext/directory.go:163
+      - pkg/sentry/fsimpl/ext/directory.go:164
+      - pkg/sentry/fsimpl/ext/extent_file.go:142
+      - pkg/sentry/fsimpl/ext/extent_file.go:143
+      - pkg/sentry/fsimpl/ext/ext.go:105
+      - pkg/sentry/fsimpl/ext/filesystem.go:287
+      - pkg/sentry/fsimpl/ext/regular_file.go:153
+      - pkg/sentry/fsimpl/ext/symlink.go:113
+      - pkg/sentry/fsimpl/fuse/connection_control.go:194
+      - pkg/sentry/fsimpl/fuse/dev.go:387
+      - pkg/sentry/fsimpl/fuse/dev_test.go:318
+      - pkg/sentry/fsimpl/fuse/fusefs.go:102
+      - pkg/sentry/fsimpl/fuse/read_write.go:129
+      - pkg/sentry/fsimpl/fuse/request_response.go:71
+      - pkg/sentry/fsimpl/gofer/directory.go:135
+      - pkg/sentry/fsimpl/gofer/filesystem.go:679
+      - pkg/sentry/fsimpl/gofer/gofer.go:1694
+      - pkg/sentry/fsimpl/gofer/gofer.go:276
+      - pkg/sentry/fsimpl/gofer/regular_file.go:81
+      - pkg/sentry/fsimpl/gofer/special_file.go:141
+      - pkg/sentry/fsimpl/host/host.go:184
+      - pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go:50
+      - pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go:90
+      - pkg/sentry/fsimpl/kernfs/fd_impl_util.go:273
+      - pkg/sentry/fsimpl/kernfs/filesystem.go:247
+      - pkg/sentry/fsimpl/kernfs/inode_impl_util.go:320
+      - pkg/sentry/fsimpl/kernfs/inode_impl_util.go:497
+      - pkg/sentry/fsimpl/kernfs/synthetic_directory.go:52
+      - pkg/sentry/fsimpl/overlay/directory.go:119
+      - pkg/sentry/fsimpl/overlay/filesystem.go:527
+      - pkg/sentry/fsimpl/overlay/non_directory.go:152
+      - pkg/sentry/fsimpl/overlay/overlay.go:115
+      - pkg/sentry/fsimpl/overlay/overlay.go:719
+      - pkg/sentry/fsimpl/pipefs/pipefs.go:74
+      - pkg/sentry/fsimpl/proc/filesystem.go:52
+      - pkg/sentry/fsimpl/proc/filesystem.go:81
+      - pkg/sentry/fsimpl/proc/subtasks.go:126
+      - pkg/sentry/fsimpl/proc/subtasks.go:189
+      - pkg/sentry/fsimpl/proc/task_fds.go:168
+      - pkg/sentry/fsimpl/proc/task_fds.go:228
+      - pkg/sentry/fsimpl/proc/task_fds.go:301
+      - pkg/sentry/fsimpl/proc/task_fds.go:318
+      - pkg/sentry/fsimpl/proc/task_fds.go:67
+      - pkg/sentry/fsimpl/proc/task_files.go:112
+      - pkg/sentry/fsimpl/proc/task_files.go:158
+      - pkg/sentry/fsimpl/proc/task_files.go:259
+      - pkg/sentry/fsimpl/proc/task_files.go:285
+      - pkg/sentry/fsimpl/proc/task_files.go:305
+      - pkg/sentry/fsimpl/proc/task_files.go:384
+      - pkg/sentry/fsimpl/proc/task_files.go:403
+      - pkg/sentry/fsimpl/proc/task_files.go:428
+      - pkg/sentry/fsimpl/proc/task_files.go:691
+      - pkg/sentry/fsimpl/proc/task_files.go:770
+      - pkg/sentry/fsimpl/proc/task_files.go:797
+      - pkg/sentry/fsimpl/proc/task_files.go:828
+      - pkg/sentry/fsimpl/proc/task_files.go:879
+      - pkg/sentry/fsimpl/proc/task_files.go:910
+      - pkg/sentry/fsimpl/proc/task_files.go:961
+      - pkg/sentry/fsimpl/proc/task.go:127
+      - pkg/sentry/fsimpl/proc/task.go:193
+      - pkg/sentry/fsimpl/proc/task_net.go:134
+      - pkg/sentry/fsimpl/proc/task_net.go:475
+      - pkg/sentry/fsimpl/proc/task_net.go:491
+      - pkg/sentry/fsimpl/proc/task_net.go:508
+      - pkg/sentry/fsimpl/proc/task_net.go:665
+      - pkg/sentry/fsimpl/proc/task_net.go:715
+      - pkg/sentry/fsimpl/proc/task_net.go:779
+      - pkg/sentry/fsimpl/proc/tasks_files.go:113
+      - pkg/sentry/fsimpl/proc/tasks_files.go:388
+      - pkg/sentry/fsimpl/proc/tasks.go:232
+      - pkg/sentry/fsimpl/proc/tasks_sys.go:145
+      - pkg/sentry/fsimpl/proc/tasks_sys.go:181
+      - pkg/sentry/fsimpl/proc/tasks_sys.go:239
+      - pkg/sentry/fsimpl/proc/tasks_sys.go:291
+      - pkg/sentry/fsimpl/proc/tasks_sys.go:375
+      - pkg/sentry/fsimpl/signalfd/signalfd.go:124
+      - pkg/sentry/fsimpl/signalfd/signalfd.go:15
+      - pkg/sentry/fsimpl/signalfd/signalfd.go:126
+      - pkg/sentry/fsimpl/sockfs/sockfs.go:36
+      - pkg/sentry/fsimpl/sockfs/sockfs.go:79
+      - pkg/sentry/fsimpl/sys/kcov.go:49
+      - pkg/sentry/fsimpl/sys/kcov.go:99
+      - pkg/sentry/fsimpl/sys/sys.go:118
+      - pkg/sentry/fsimpl/sys/sys.go:56
+      - pkg/sentry/fsimpl/testutil/testutil.go:257
+      - pkg/sentry/fsimpl/testutil/testutil.go:260
+      - pkg/sentry/fsimpl/timerfd/timerfd.go:87
+      - pkg/sentry/fsimpl/tmpfs/directory.go:112
+      - pkg/sentry/fsimpl/tmpfs/filesystem.go:195
+      - pkg/sentry/fsimpl/tmpfs/regular_file.go:226
+      - pkg/sentry/fsimpl/tmpfs/regular_file.go:346
+      - pkg/sentry/fsimpl/tmpfs/tmpfs.go:103
+      - pkg/sentry/fsimpl/tmpfs/tmpfs.go:733
+      - pkg/sentry/fsimpl/verity/filesystem.go:490
+      - pkg/sentry/fsimpl/verity/verity.go:156
+      - pkg/sentry/fsimpl/verity/verity.go:629
+      - pkg/sentry/fsimpl/verity/verity.go:672
+      - pkg/sentry/fs/mount.go:162
+      - pkg/sentry/fs/mount.go:256
+      - pkg/sentry/fs/mount_overlay.go:144
+      - pkg/sentry/fs/mounts.go:432
+      - pkg/sentry/fs/proc/exec_args.go:104
+      - pkg/sentry/fs/proc/exec_args.go:73
+      - pkg/sentry/fs/proc/fds.go:269
+      - pkg/sentry/fs/proc/loadavg.go:33
+      - pkg/sentry/fs/proc/meminfo.go:39
+      - pkg/sentry/fs/proc/mounts.go:193
+      - pkg/sentry/fs/proc/mounts.go:84
+      - pkg/sentry/fs/proc/net.go:125
+      - pkg/sentry/fs/proc/proc.go:146
+      - pkg/sentry/fs/proc/proc.go:204
+      - pkg/sentry/fs/proc/seqfile/seqfile.go:210
+      - pkg/sentry/fs/proc/sys.go:146
+      - pkg/sentry/fs/proc/sys.go:43
+      - pkg/sentry/fs/proc/sys_net.go:113
+      - pkg/sentry/fs/proc/sys_net.go:205
+      - pkg/sentry/fs/proc/sys_net.go:233
+      - pkg/sentry/fs/proc/sys_net.go:307
+      - pkg/sentry/fs/proc/sys_net.go:335
+      - pkg/sentry/fs/proc/sys_net.go:446
+      - pkg/sentry/fs/proc/sys_net.go:456
+      - pkg/sentry/fs/proc/sys_net.go:89
+      - pkg/sentry/fs/proc/task.go:170
+      - pkg/sentry/fs/proc/task.go:322
+      - pkg/sentry/fs/proc/task.go:427
+      - pkg/sentry/fs/proc/task.go:467
+      - pkg/sentry/fs/proc/task.go:500
+      - pkg/sentry/fs/proc/task.go:784
+      - pkg/sentry/fs/proc/task.go:839
+      - pkg/sentry/fs/proc/task.go:920
+      - pkg/sentry/fs/proc/uid_gid_map.go:108
+      - pkg/sentry/fs/proc/uid_gid_map.go:79
+      - pkg/sentry/fs/proc/uptime.go:75
+      - pkg/sentry/fs/ramfs/dir.go:447
+      - pkg/sentry/fs/tmpfs/inode_file.go:436
+      - pkg/sentry/fs/tmpfs/inode_file.go:537
+      - pkg/sentry/fs/tty/dir.go:313
+      - pkg/sentry/fs/tty/master.go:131
+      - pkg/sentry/fs/tty/master.go:91
+      - pkg/sentry/fs/tty/replica.go:116
+      - pkg/sentry/fs/tty/replica.go:88
+      - pkg/sentry/kernel/auth/id_map.go:269
+      - pkg/sentry/kernel/fasync/fasync.go:67
+      - pkg/sentry/kernel/kcov.go:209
+      - pkg/sentry/kernel/kcov.go:223
+      - pkg/sentry/kernel/kernel.go:343
+      - pkg/sentry/kernel/kernel.go:368
+      - pkg/sentry/kernel/pipe/node_test.go:112
+      - pkg/sentry/kernel/pipe/node_test.go:119
+      - pkg/sentry/kernel/pipe/node_test.go:130
+      - pkg/sentry/kernel/pipe/node_test.go:137
+      - pkg/sentry/kernel/pipe/node_test.go:149
+      - pkg/sentry/kernel/pipe/node_test.go:150
+      - pkg/sentry/kernel/pipe/node_test.go:158
+      - pkg/sentry/kernel/pipe/node_test.go:174
+      - pkg/sentry/kernel/pipe/node_test.go:180
+      - pkg/sentry/kernel/pipe/node_test.go:193
+      - pkg/sentry/kernel/pipe/node_test.go:202
+      - pkg/sentry/kernel/pipe/node_test.go:205
+      - pkg/sentry/kernel/pipe/node_test.go:216
+      - pkg/sentry/kernel/pipe/node_test.go:219
+      - pkg/sentry/kernel/pipe/node_test.go:271
+      - pkg/sentry/kernel/pipe/node_test.go:290
+      - pkg/sentry/kernel/pipe/pipe_test.go:93
+      - pkg/sentry/kernel/pipe/reader_writer.go:65
+      - pkg/sentry/kernel/posixtimer.go:157
+      - pkg/sentry/kernel/ptrace.go:218
+      - pkg/sentry/kernel/semaphore/semaphore.go:323
+      - pkg/sentry/kernel/sessions.go:123
+      - pkg/sentry/kernel/sessions.go:508
+      - pkg/sentry/kernel/signal_handlers.go:57
+      - pkg/sentry/kernel/task_context.go:72
+      - pkg/sentry/kernel/task_exit.go:67
+      - pkg/sentry/kernel/task_sched.go:255
+      - pkg/sentry/kernel/task_sched.go:280
+      - pkg/sentry/kernel/task_sched.go:323
+      - pkg/sentry/kernel/task_stop.go:192
+      - pkg/sentry/kernel/thread_group.go:530
+      - pkg/sentry/kernel/timekeeper.go:316
+      - pkg/sentry/kernel/vdso.go:106
+      - pkg/sentry/kernel/vdso.go:118
+      - pkg/sentry/memmap/memmap.go:103
+      - pkg/sentry/memmap/memmap.go:163
+      - pkg/sentry/mm/address_space.go:42
+      - pkg/sentry/mm/address_space.go:42
+      - pkg/sentry/mm/aio_context.go:208
+      - pkg/sentry/mm/aio_context.go:288
+      - pkg/sentry/mm/pma.go:683
+      - pkg/sentry/mm/special_mappable.go:80
+      - pkg/sentry/syscalls/linux/sys_sem.go:62
+      - pkg/sentry/syscalls/linux/sys_time.go:189
+      - pkg/sentry/usage/cpu.go:42
+      - pkg/sentry/vfs/anonfs.go:302
+      - pkg/sentry/vfs/anonfs.go:99
+      - pkg/sentry/vfs/dentry.go:214
+      - pkg/sentry/vfs/epoll.go:168
+      - pkg/sentry/vfs/epoll.go:314
+      - pkg/sentry/vfs/file_description.go:549
+      - pkg/sentry/vfs/file_description_impl_util.go:304
+      - pkg/sentry/vfs/file_description_impl_util.go:412
+      - pkg/sentry/vfs/filesystem.go:76
+      - pkg/sentry/vfs/lock.go:15
+      - pkg/sentry/vfs/lock.go:47
+      - pkg/sentry/vfs/memxattr/xattr.go:37
+      - pkg/sentry/vfs/mount.go:510
+      - pkg/sentry/vfs/mount.go:667
+      - pkg/sentry/vfs/mount_test.go:106
+      - pkg/sentry/vfs/mount_test.go:160
+      - pkg/sentry/vfs/mount_test.go:215
+      - pkg/sentry/vfs/mount_unsafe.go:153
+      - pkg/sentry/vfs/resolving_path.go:228
+      - pkg/sentry/vfs/vfs.go:897
+      - pkg/shim/runsc/runsc.go:16
+      - pkg/shim/runsc/utils.go:16
+      - pkg/shim/v1/proc/deleted_state.go:16
+      - pkg/shim/v1/proc/exec.go:16
+      - pkg/shim/v1/proc/exec_state.go:16
+      - pkg/shim/v1/proc/init.go:16
+      - pkg/shim/v1/proc/init_state.go:16
+      - pkg/shim/v1/proc/io.go:16
+      - pkg/shim/v1/proc/process.go:16
+      - pkg/shim/v1/proc/types.go:16
+      - pkg/shim/v1/proc/utils.go:16
+      - pkg/shim/v1/shim/api.go:16
+      - pkg/shim/v1/shim/platform.go:16
+      - pkg/shim/v1/shim/service.go:16
+      - pkg/shim/v1/utils/annotations.go:15
+      - pkg/shim/v1/utils/utils.go:15
+      - pkg/shim/v1/utils/volumes.go:15
+      - pkg/shim/v2/api.go:16
+      - pkg/shim/v2/epoll.go:18
+      - pkg/shim/v2/options/options.go:15
+      - pkg/shim/v2/options/options.go:24
+      - pkg/shim/v2/options/options.go:26
+      - pkg/shim/v2/runtimeoptions/runtimeoptions.go:16
+      - pkg/shim/v2/runtimeoptions/runtimeoptions_cri.go # Generated: exempt all.
+      - pkg/shim/v2/runtimeoptions/runtimeoptions_test.go:22
+      - pkg/shim/v2/service.go:15
+      - pkg/shim/v2/service_linux.go:18
+      - pkg/state/tests/integer_test.go:23
+      - pkg/state/tests/integer_test.go:28
+      - pkg/sync/rwmutex_test.go:105
+      - pkg/syserr/host_linux.go:35
+      - pkg/unet/unet_test.go:634
+      - pkg/unet/unet_test.go:662
+      - pkg/unet/unet_test.go:703
+      - pkg/unet/unet_test.go:98
+      - pkg/usermem/addr.go:34
+      - pkg/usermem/usermem.go:171
+      - pkg/usermem/usermem.go:170
+      - runsc/boot/compat.go:22
+      - runsc/boot/compat.go:56
+      - runsc/boot/loader.go:1115
+      - runsc/boot/loader.go:1120
+      - runsc/cmd/checkpoint.go:151
+      - runsc/config/flags.go:32
+      - runsc/container/container.go:641
+      - runsc/container/container.go:988
+      - runsc/specutils/specutils.go:172
+      - runsc/specutils/specutils.go:428
+      - runsc/specutils/specutils.go:436
+      - runsc/specutils/specutils.go:442
+      - runsc/specutils/specutils.go:447
+      - runsc/specutils/specutils.go:454
+      - test/cmd/test_app/fds.go:171
+      - test/iptables/filter_output.go:251
+      - test/packetimpact/testbench/connections.go:77
+      - tools/bigquery/bigquery.go:106
+      - tools/checkescape/test1/test1.go:108
+      - tools/checkescape/test1/test1.go:122
+      - tools/checkescape/test1/test1.go:137
+      - tools/checkescape/test1/test1.go:151
+      - tools/checkescape/test1/test1.go:170
+      - tools/checkescape/test1/test1.go:39
+      - tools/checkescape/test1/test1.go:45
+      - tools/checkescape/test1/test1.go:50
+      - tools/checkescape/test1/test1.go:64
+      - tools/checkescape/test1/test1.go:80
+      - tools/checkescape/test1/test1.go:94
+      - tools/go_generics/imports.go:51
+      - tools/go_generics/imports.go:75
+      - tools/go_marshal/gomarshal/generator.go:177
+      - tools/go_marshal/gomarshal/generator.go:81
+      - tools/go_marshal/gomarshal/generator.go:85
+      - tools/go_marshal/test/escape/escape.go:15
+      - tools/go_marshal/test/test.go:164
+analyzers:
+  asmdecl:
+    external: # Enabled.
+  assign:
+    external:
+      exclude:
+        - gazelle/walk/walk.go
+  atomic:
+    external: # Enabled.
+  bools:
+    external: # Enabled.
+  buildtag:
+    external: # Enabled.
+  cgocall:
+    external: # Enabled.
+  shadow: # Disable for now.
+    generated:
+      exclude: [".*"]
+    internal:
+      exclude: [".*"]
+  composites: # Disable for now.
+    generated:
+      exclude: [".*"]
+    internal:
+      exclude: [".*"]
+  errorsas:
+    external: # Enabled.
+  httpresponse:
+    external: # Enabled.
+  loopclosure:
+    external: # Enabled.
+  nilfunc:
+    external: # Enabled.
+  nilness:
+    internal:
+      exclude:
+        - pkg/sentry/platform/kvm/kvm_test.go # Intentional.
+        - tools/bigquery/bigquery.go          # False positive.
+  printf:
+    external: # Enabled.
+  shift:
+    external: # Enabled.
+  stringintconv:
+    external:
+      exclude:
+        - ".*protobuf/.*.go"              # Bad conversions.
+        - ".*flate/huffman_bit_writer.go" # Bad conversion.
+        # Runtime internal violations.
+        - ".*reflect/value.go"
+        - ".*encoding/xml/xml.go"
+        - ".*runtime/pprof/internal/profile/proto.go"
+        - ".*fmt/scan.go"
+        - ".*go/types/conversions.go"
+        - ".*golang.org/x/net/dns/dnsmessage/message.go"
+  tests:
+    external: # Enabled.
+  unmarshal:
+    external: # Enabled.
+  unreachable:
+    external: # Enabled.
+  unsafeptr:
+    internal:
+      exclude:
+        - ".*_test.go"                                             # Exclude tests.
+        - "pkg/flipcall/.*_unsafe.go"                              # Special case.
+        - pkg/gohacks/gohacks_unsafe.go                            # Special case.
+        - pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go          # Special case.
+        - pkg/sentry/platform/kvm/bluepill_unsafe.go               # Special case.
+        - pkg/sentry/platform/kvm/machine_unsafe.go                # Special case.
+        - pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go # Special case.
+        - pkg/sentry/platform/safecopy/safecopy_unsafe.go          # Special case.
+        - pkg/sentry/vfs/mount_unsafe.go                           # Special case.
+        - pkg/state/decode_unsafe.go                               # Special case.
+  unusedresult:
+    external: # Enabled.
+  checkescape:
+    external: # Enabled.
diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index b5c5cc20b..a0654df2f 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -38,6 +38,7 @@ go_library(
         "ipc.go",
         "limits.go",
         "linux.go",
+        "membarrier.go",
         "mm.go",
         "netdevice.go",
         "netfilter.go",
@@ -54,6 +55,8 @@ go_library(
         "sched.go",
         "seccomp.go",
         "sem.go",
+        "sem_amd64.go",
+        "sem_arm64.go",
         "shm.go",
         "signal.go",
         "signalfd.go",
@@ -74,9 +77,9 @@ go_library(
         "//pkg/abi",
         "//pkg/binary",
         "//pkg/bits",
+        "//pkg/marshal",
+        "//pkg/marshal/primitive",
         "//pkg/usermem",
-        "//tools/go_marshal/marshal",
-        "//tools/go_marshal/primitive",
     ],
 )
 
diff --git a/pkg/abi/linux/aio.go b/pkg/abi/linux/aio.go
index 86ee3f8b5..5fc099892 100644
--- a/pkg/abi/linux/aio.go
+++ b/pkg/abi/linux/aio.go
@@ -42,6 +42,8 @@ const (
 //
 // The priority field is currently ignored in the implementation below. Also
 // note that the IOCB_FLAG_RESFD feature is not supported.
+//
+// +marshal
 type IOCallback struct {
 	Data uint64
 	Key  uint32
@@ -64,6 +66,7 @@ type IOCallback struct {
 
 // IOEvent describes an I/O result.
 //
+// +marshal
 // +stateify savable
 type IOEvent struct {
 	Data    uint64
diff --git a/pkg/abi/linux/bpf.go b/pkg/abi/linux/bpf.go
index aa3d3ce70..9422fcf69 100644
--- a/pkg/abi/linux/bpf.go
+++ b/pkg/abi/linux/bpf.go
@@ -16,6 +16,7 @@ package linux
 
 // BPFInstruction is a raw BPF virtual machine instruction.
 //
+// +marshal slice:BPFInstructionSlice
 // +stateify savable
 type BPFInstruction struct {
 	// OpCode is the operation to execute.
diff --git a/pkg/abi/linux/capability.go b/pkg/abi/linux/capability.go
index 965f74663..afd16cc27 100644
--- a/pkg/abi/linux/capability.go
+++ b/pkg/abi/linux/capability.go
@@ -177,12 +177,16 @@ const (
 )
 
 // CapUserHeader is equivalent to Linux's cap_user_header_t.
+//
+// +marshal
 type CapUserHeader struct {
 	Version uint32
 	Pid     int32
 }
 
 // CapUserData is equivalent to Linux's cap_user_data_t.
+//
+// +marshal slice:CapUserDataSlice
 type CapUserData struct {
 	Effective   uint32
 	Permitted   uint32
diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go
index 192e2093b..7771650b3 100644
--- a/pkg/abi/linux/dev.go
+++ b/pkg/abi/linux/dev.go
@@ -54,9 +54,9 @@ const (
 	// Unix98 PTY masters.
 	UNIX98_PTY_MASTER_MAJOR = 128
 
-	// UNIX98_PTY_SLAVE_MAJOR is the initial major device number for
-	// Unix98 PTY slaves.
-	UNIX98_PTY_SLAVE_MAJOR = 136
+	// UNIX98_PTY_REPLICA_MAJOR is the initial major device number for
+	// Unix98 PTY replicas.
+	UNIX98_PTY_REPLICA_MAJOR = 136
 )
 
 // Minor device numbers for TTYAUX_MAJOR.
diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go
index 9242e80a5..cc3571fad 100644
--- a/pkg/abi/linux/fcntl.go
+++ b/pkg/abi/linux/fcntl.go
@@ -45,6 +45,8 @@ const (
 )
 
 // Flock is the lock structure for F_SETLK.
+//
+// +marshal
 type Flock struct {
 	Type   int16
 	Whence int16
@@ -63,6 +65,8 @@ const (
 )
 
 // FOwnerEx is the owner structure for F_SETOWN_EX and F_GETOWN_EX.
+//
+// +marshal
 type FOwnerEx struct {
 	Type int32
 	PID  int32
diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go
index 158d2db5b..0d921ed6f 100644
--- a/pkg/abi/linux/fs.go
+++ b/pkg/abi/linux/fs.go
@@ -29,6 +29,7 @@ const (
 	SYSFS_MAGIC           = 0x62656572
 	TMPFS_MAGIC           = 0x01021994
 	V9FS_MAGIC            = 0x01021997
+	FUSE_SUPER_MAGIC      = 0x65735546
 )
 
 // Filesystem path limits, from uapi/linux/limits.h.
@@ -44,17 +45,18 @@ type Statfs struct {
 	// Type is one of the filesystem magic values, defined above.
 	Type uint64
 
-	// BlockSize is the data block size.
+	// BlockSize is the optimal transfer block size in bytes.
 	BlockSize int64
 
-	// Blocks is the number of data blocks in use.
+	// Blocks is the maximum number of data blocks the filesystem may store, in
+	// units of BlockSize.
 	Blocks uint64
 
-	// BlocksFree is the number of free blocks.
+	// BlocksFree is the number of free data blocks, in units of BlockSize.
 	BlocksFree uint64
 
-	// BlocksAvailable is the number of blocks free for use by
-	// unprivileged users.
+	// BlocksAvailable is the number of data blocks free for use by
+	// unprivileged users, in units of BlockSize.
 	BlocksAvailable uint64
 
 	// Files is the number of used file nodes on the filesystem.
diff --git a/pkg/abi/linux/fuse.go b/pkg/abi/linux/fuse.go
index 7e30483ee..d91c97a64 100644
--- a/pkg/abi/linux/fuse.go
+++ b/pkg/abi/linux/fuse.go
@@ -14,12 +14,20 @@
 
 package linux
 
+import (
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
+)
+
 // +marshal
 type FUSEOpcode uint32
 
 // +marshal
 type FUSEOpID uint64
 
+// FUSE_ROOT_ID is the id of root inode.
+const FUSE_ROOT_ID = 1
+
 // Opcodes for FUSE operations. Analogous to the opcodes in include/linux/fuse.h.
 const (
 	FUSE_LOOKUP   FUSEOpcode = 1
@@ -116,61 +124,28 @@ type FUSEHeaderOut struct {
 	Unique FUSEOpID
 }
 
-// FUSEWriteIn is the header written by a daemon when it makes a
-// write request to the FUSE filesystem.
-//
-// +marshal
-type FUSEWriteIn struct {
-	// Fh specifies the file handle that is being written to.
-	Fh uint64
-
-	// Offset is the offset of the write.
-	Offset uint64
-
-	// Size is the size of data being written.
-	Size uint32
-
-	// WriteFlags is the flags used during the write.
-	WriteFlags uint32
-
-	// LockOwner is the ID of the lock owner.
-	LockOwner uint64
-
-	// Flags is the flags for the request.
-	Flags uint32
-
-	_ uint32
-}
-
 // FUSE_INIT flags, consistent with the ones in include/uapi/linux/fuse.h.
+// Our taget version is 7.23 but we have few implemented in advance.
 const (
-	FUSE_ASYNC_READ          = 1 << 0
-	FUSE_POSIX_LOCKS         = 1 << 1
-	FUSE_FILE_OPS            = 1 << 2
-	FUSE_ATOMIC_O_TRUNC      = 1 << 3
-	FUSE_EXPORT_SUPPORT      = 1 << 4
-	FUSE_BIG_WRITES          = 1 << 5
-	FUSE_DONT_MASK           = 1 << 6
-	FUSE_SPLICE_WRITE        = 1 << 7
-	FUSE_SPLICE_MOVE         = 1 << 8
-	FUSE_SPLICE_READ         = 1 << 9
-	FUSE_FLOCK_LOCKS         = 1 << 10
-	FUSE_HAS_IOCTL_DIR       = 1 << 11
-	FUSE_AUTO_INVAL_DATA     = 1 << 12
-	FUSE_DO_READDIRPLUS      = 1 << 13
-	FUSE_READDIRPLUS_AUTO    = 1 << 14
-	FUSE_ASYNC_DIO           = 1 << 15
-	FUSE_WRITEBACK_CACHE     = 1 << 16
-	FUSE_NO_OPEN_SUPPORT     = 1 << 17
-	FUSE_PARALLEL_DIROPS     = 1 << 18
-	FUSE_HANDLE_KILLPRIV     = 1 << 19
-	FUSE_POSIX_ACL           = 1 << 20
-	FUSE_ABORT_ERROR         = 1 << 21
-	FUSE_MAX_PAGES           = 1 << 22
-	FUSE_CACHE_SYMLINKS      = 1 << 23
-	FUSE_NO_OPENDIR_SUPPORT  = 1 << 24
-	FUSE_EXPLICIT_INVAL_DATA = 1 << 25
-	FUSE_MAP_ALIGNMENT       = 1 << 26
+	FUSE_ASYNC_READ       = 1 << 0
+	FUSE_POSIX_LOCKS      = 1 << 1
+	FUSE_FILE_OPS         = 1 << 2
+	FUSE_ATOMIC_O_TRUNC   = 1 << 3
+	FUSE_EXPORT_SUPPORT   = 1 << 4
+	FUSE_BIG_WRITES       = 1 << 5
+	FUSE_DONT_MASK        = 1 << 6
+	FUSE_SPLICE_WRITE     = 1 << 7
+	FUSE_SPLICE_MOVE      = 1 << 8
+	FUSE_SPLICE_READ      = 1 << 9
+	FUSE_FLOCK_LOCKS      = 1 << 10
+	FUSE_HAS_IOCTL_DIR    = 1 << 11
+	FUSE_AUTO_INVAL_DATA  = 1 << 12
+	FUSE_DO_READDIRPLUS   = 1 << 13
+	FUSE_READDIRPLUS_AUTO = 1 << 14
+	FUSE_ASYNC_DIO        = 1 << 15
+	FUSE_WRITEBACK_CACHE  = 1 << 16
+	FUSE_NO_OPEN_SUPPORT  = 1 << 17
+	FUSE_MAX_PAGES        = 1 << 22 // From FUSE 7.28
 )
 
 // currently supported FUSE protocol version numbers.
@@ -179,6 +154,13 @@ const (
 	FUSE_KERNEL_MINOR_VERSION = 31
 )
 
+// Constants relevant to FUSE operations.
+const (
+	FUSE_NAME_MAX     = 1024
+	FUSE_PAGE_SIZE    = 4096
+	FUSE_DIRENT_ALIGN = 8
+)
+
 // FUSEInitIn is the request sent by the kernel to the daemon,
 // to negotiate the version and flags.
 //
@@ -199,7 +181,7 @@ type FUSEInitIn struct {
 }
 
 // FUSEInitOut is the reply sent by the daemon to the kernel
-// for FUSEInitIn.
+// for FUSEInitIn. We target FUSE 7.23; this struct supports 7.28.
 //
 // +marshal
 type FUSEInitOut struct {
@@ -240,13 +222,16 @@ type FUSEInitOut struct {
 	// if the value from daemon is too large.
 	MaxPages uint16
 
-	// MapAlignment is an unknown field and not used by this package at this moment.
-	// Use as a placeholder to be consistent with the FUSE protocol.
-	MapAlignment uint16
+	_ uint16
 
 	_ [8]uint32
 }
 
+// FUSE_GETATTR_FH is currently the only flag of FUSEGetAttrIn.GetAttrFlags.
+// If it is set, the file handle (FUSEGetAttrIn.Fh) is used to indicate the
+// object instead of the node id attribute in the request header.
+const FUSE_GETATTR_FH = (1 << 0)
+
 // FUSEGetAttrIn is the request sent by the kernel to the daemon,
 // to get the attribute of a inode.
 //
@@ -267,22 +252,52 @@ type FUSEGetAttrIn struct {
 //
 // +marshal
 type FUSEAttr struct {
-	Ino       uint64
-	Size      uint64
-	Blocks    uint64
-	Atime     uint64
-	Mtime     uint64
-	Ctime     uint64
+	// Ino is the inode number of this file.
+	Ino uint64
+
+	// Size is the size of this file.
+	Size uint64
+
+	// Blocks is the number of the 512B blocks allocated by this file.
+	Blocks uint64
+
+	// Atime is the time of last access.
+	Atime uint64
+
+	// Mtime is the time of last modification.
+	Mtime uint64
+
+	// Ctime is the time of last status change.
+	Ctime uint64
+
+	// AtimeNsec is the nano second part of Atime.
 	AtimeNsec uint32
+
+	// MtimeNsec is the nano second part of Mtime.
 	MtimeNsec uint32
+
+	// CtimeNsec is the nano second part of Ctime.
 	CtimeNsec uint32
-	Mode      uint32
-	Nlink     uint32
-	UID       uint32
-	GID       uint32
-	Rdev      uint32
-	BlkSize   uint32
-	_         uint32
+
+	// Mode contains the file type and mode.
+	Mode uint32
+
+	// Nlink is the number of the hard links.
+	Nlink uint32
+
+	// UID is user ID of the owner.
+	UID uint32
+
+	// GID is group ID of the owner.
+	GID uint32
+
+	// Rdev is the device ID if this is a special file.
+	Rdev uint32
+
+	// BlkSize is the block size for filesystem I/O.
+	BlkSize uint32
+
+	_ uint32
 }
 
 // FUSEGetAttrOut is the reply sent by the daemon to the kernel
@@ -301,3 +316,558 @@ type FUSEGetAttrOut struct {
 	// Attr contains the metadata returned from the FUSE server
 	Attr FUSEAttr
 }
+
+// FUSEEntryOut is the reply sent by the daemon to the kernel
+// for FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and
+// FUSE_LOOKUP.
+//
+// +marshal
+type FUSEEntryOut struct {
+	// NodeID is the ID for current inode.
+	NodeID uint64
+
+	// Generation is the generation number of inode.
+	// Used to identify an inode that have different ID at different time.
+	Generation uint64
+
+	// EntryValid indicates timeout for an entry.
+	EntryValid uint64
+
+	// AttrValid indicates timeout for an entry's attributes.
+	AttrValid uint64
+
+	// EntryValidNsec indicates timeout for an entry in nanosecond.
+	EntryValidNSec uint32
+
+	// AttrValidNsec indicates timeout for an entry's attributes in nanosecond.
+	AttrValidNSec uint32
+
+	// Attr contains the attributes of an entry.
+	Attr FUSEAttr
+}
+
+// FUSELookupIn is the request sent by the kernel to the daemon
+// to look up a file name.
+//
+// Dynamically-sized objects cannot be marshalled.
+type FUSELookupIn struct {
+	marshal.StubMarshallable
+
+	// Name is a file name to be looked up.
+	Name string
+}
+
+// MarshalBytes serializes r.name to the dst buffer.
+func (r *FUSELookupIn) MarshalBytes(buf []byte) {
+	copy(buf, r.Name)
+}
+
+// SizeBytes is the size of the memory representation of FUSELookupIn.
+// 1 extra byte for null-terminated string.
+func (r *FUSELookupIn) SizeBytes() int {
+	return len(r.Name) + 1
+}
+
+// MAX_NON_LFS indicates the maximum offset without large file support.
+const MAX_NON_LFS = ((1 << 31) - 1)
+
+// flags returned by OPEN request.
+const (
+	// FOPEN_DIRECT_IO indicates bypassing page cache for this opened file.
+	FOPEN_DIRECT_IO = 1 << 0
+	// FOPEN_KEEP_CACHE avoids invalidate of data cache on open.
+	FOPEN_KEEP_CACHE = 1 << 1
+	// FOPEN_NONSEEKABLE indicates the file cannot be seeked.
+	FOPEN_NONSEEKABLE = 1 << 2
+)
+
+// FUSEOpenIn is the request sent by the kernel to the daemon,
+// to negotiate flags and get file handle.
+//
+// +marshal
+type FUSEOpenIn struct {
+	// Flags of this open request.
+	Flags uint32
+
+	_ uint32
+}
+
+// FUSEOpenOut is the reply sent by the daemon to the kernel
+// for FUSEOpenIn.
+//
+// +marshal
+type FUSEOpenOut struct {
+	// Fh is the file handler for opened file.
+	Fh uint64
+
+	// OpenFlag for the opened file.
+	OpenFlag uint32
+
+	_ uint32
+}
+
+// FUSE_READ flags, consistent with the ones in include/uapi/linux/fuse.h.
+const (
+	FUSE_READ_LOCKOWNER = 1 << 1
+)
+
+// FUSEReadIn is the request sent by the kernel to the daemon
+// for FUSE_READ.
+//
+// +marshal
+type FUSEReadIn struct {
+	// Fh is the file handle in userspace.
+	Fh uint64
+
+	// Offset is the read offset.
+	Offset uint64
+
+	// Size is the number of bytes to read.
+	Size uint32
+
+	// ReadFlags for this FUSE_READ request.
+	// Currently only contains FUSE_READ_LOCKOWNER.
+	ReadFlags uint32
+
+	// LockOwner is the id of the lock owner if there is one.
+	LockOwner uint64
+
+	// Flags for the underlying file.
+	Flags uint32
+
+	_ uint32
+}
+
+// FUSEWriteIn is the first part of the payload of the
+// request sent by the kernel to the daemon
+// for FUSE_WRITE (struct for FUSE version >= 7.9).
+//
+// The second part of the payload is the
+// binary bytes of the data to be written.
+//
+// +marshal
+type FUSEWriteIn struct {
+	// Fh is the file handle in userspace.
+	Fh uint64
+
+	// Offset is the write offset.
+	Offset uint64
+
+	// Size is the number of bytes to write.
+	Size uint32
+
+	// ReadFlags for this FUSE_WRITE request.
+	WriteFlags uint32
+
+	// LockOwner is the id of the lock owner if there is one.
+	LockOwner uint64
+
+	// Flags for the underlying file.
+	Flags uint32
+
+	_ uint32
+}
+
+// FUSEWriteOut is the payload of the reply sent by the daemon to the kernel
+// for a FUSE_WRITE request.
+//
+// +marshal
+type FUSEWriteOut struct {
+	// Size is the number of bytes written.
+	Size uint32
+
+	_ uint32
+}
+
+// FUSEReleaseIn is the request sent by the kernel to the daemon
+// when there is no more reference to a file.
+//
+// +marshal
+type FUSEReleaseIn struct {
+	// Fh is the file handler for the file to be released.
+	Fh uint64
+
+	// Flags of the file.
+	Flags uint32
+
+	// ReleaseFlags of this release request.
+	ReleaseFlags uint32
+
+	// LockOwner is the id of the lock owner if there is one.
+	LockOwner uint64
+}
+
+// FUSECreateMeta contains all the static fields of FUSECreateIn,
+// which is used for FUSE_CREATE.
+//
+// +marshal
+type FUSECreateMeta struct {
+	// Flags of the creating file.
+	Flags uint32
+
+	// Mode is the mode of the creating file.
+	Mode uint32
+
+	// Umask is the current file mode creation mask.
+	Umask uint32
+	_     uint32
+}
+
+// FUSECreateIn contains all the arguments sent by the kernel to the daemon, to
+// atomically create and open a new regular file.
+//
+// Dynamically-sized objects cannot be marshalled.
+type FUSECreateIn struct {
+	marshal.StubMarshallable
+
+	// CreateMeta contains mode, rdev and umash field for FUSE_MKNODS.
+	CreateMeta FUSECreateMeta
+
+	// Name is the name of the node to create.
+	Name string
+}
+
+// MarshalBytes serializes r.CreateMeta and r.Name to the dst buffer.
+func (r *FUSECreateIn) MarshalBytes(buf []byte) {
+	r.CreateMeta.MarshalBytes(buf[:r.CreateMeta.SizeBytes()])
+	copy(buf[r.CreateMeta.SizeBytes():], r.Name)
+}
+
+// SizeBytes is the size of the memory representation of FUSECreateIn.
+// 1 extra byte for null-terminated string.
+func (r *FUSECreateIn) SizeBytes() int {
+	return r.CreateMeta.SizeBytes() + len(r.Name) + 1
+}
+
+// FUSEMknodMeta contains all the static fields of FUSEMknodIn,
+// which is used for FUSE_MKNOD.
+//
+// +marshal
+type FUSEMknodMeta struct {
+	// Mode of the inode to create.
+	Mode uint32
+
+	// Rdev encodes device major and minor information.
+	Rdev uint32
+
+	// Umask is the current file mode creation mask.
+	Umask uint32
+
+	_ uint32
+}
+
+// FUSEMknodIn contains all the arguments sent by the kernel
+// to the daemon, to create a new file node.
+//
+// Dynamically-sized objects cannot be marshalled.
+type FUSEMknodIn struct {
+	marshal.StubMarshallable
+
+	// MknodMeta contains mode, rdev and umash field for FUSE_MKNODS.
+	MknodMeta FUSEMknodMeta
+
+	// Name is the name of the node to create.
+	Name string
+}
+
+// MarshalBytes serializes r.MknodMeta and r.Name to the dst buffer.
+func (r *FUSEMknodIn) MarshalBytes(buf []byte) {
+	r.MknodMeta.MarshalBytes(buf[:r.MknodMeta.SizeBytes()])
+	copy(buf[r.MknodMeta.SizeBytes():], r.Name)
+}
+
+// SizeBytes is the size of the memory representation of FUSEMknodIn.
+// 1 extra byte for null-terminated string.
+func (r *FUSEMknodIn) SizeBytes() int {
+	return r.MknodMeta.SizeBytes() + len(r.Name) + 1
+}
+
+// FUSESymLinkIn is the request sent by the kernel to the daemon,
+// to create a symbolic link.
+//
+// Dynamically-sized objects cannot be marshalled.
+type FUSESymLinkIn struct {
+	marshal.StubMarshallable
+
+	// Name of symlink to create.
+	Name string
+
+	// Target of the symlink.
+	Target string
+}
+
+// MarshalBytes serializes r.Name and r.Target to the dst buffer.
+// Left null-termination at end of r.Name and r.Target.
+func (r *FUSESymLinkIn) MarshalBytes(buf []byte) {
+	copy(buf, r.Name)
+	copy(buf[len(r.Name)+1:], r.Target)
+}
+
+// SizeBytes is the size of the memory representation of FUSESymLinkIn.
+// 2 extra bytes for null-terminated string.
+func (r *FUSESymLinkIn) SizeBytes() int {
+	return len(r.Name) + len(r.Target) + 2
+}
+
+// FUSEEmptyIn is used by operations without request body.
+type FUSEEmptyIn struct{ marshal.StubMarshallable }
+
+// MarshalBytes do nothing for marshal.
+func (r *FUSEEmptyIn) MarshalBytes(buf []byte) {}
+
+// SizeBytes is 0 for empty request.
+func (r *FUSEEmptyIn) SizeBytes() int {
+	return 0
+}
+
+// FUSEMkdirMeta contains all the static fields of FUSEMkdirIn,
+// which is used for FUSE_MKDIR.
+//
+// +marshal
+type FUSEMkdirMeta struct {
+	// Mode of the directory of create.
+	Mode uint32
+
+	// Umask is the user file creation mask.
+	Umask uint32
+}
+
+// FUSEMkdirIn contains all the arguments sent by the kernel
+// to the daemon, to create a new directory.
+//
+// Dynamically-sized objects cannot be marshalled.
+type FUSEMkdirIn struct {
+	marshal.StubMarshallable
+
+	// MkdirMeta contains Mode and Umask of the directory to create.
+	MkdirMeta FUSEMkdirMeta
+
+	// Name of the directory to create.
+	Name string
+}
+
+// MarshalBytes serializes r.MkdirMeta and r.Name to the dst buffer.
+func (r *FUSEMkdirIn) MarshalBytes(buf []byte) {
+	r.MkdirMeta.MarshalBytes(buf[:r.MkdirMeta.SizeBytes()])
+	copy(buf[r.MkdirMeta.SizeBytes():], r.Name)
+}
+
+// SizeBytes is the size of the memory representation of FUSEMkdirIn.
+// 1 extra byte for null-terminated Name string.
+func (r *FUSEMkdirIn) SizeBytes() int {
+	return r.MkdirMeta.SizeBytes() + len(r.Name) + 1
+}
+
+// FUSERmDirIn is the request sent by the kernel to the daemon
+// when trying to remove a directory.
+//
+// Dynamically-sized objects cannot be marshalled.
+type FUSERmDirIn struct {
+	marshal.StubMarshallable
+
+	// Name is a directory name to be removed.
+	Name string
+}
+
+// MarshalBytes serializes r.name to the dst buffer.
+func (r *FUSERmDirIn) MarshalBytes(buf []byte) {
+	copy(buf, r.Name)
+}
+
+// SizeBytes is the size of the memory representation of FUSERmDirIn.
+func (r *FUSERmDirIn) SizeBytes() int {
+	return len(r.Name) + 1
+}
+
+// FUSEDirents is a list of Dirents received from the FUSE daemon server.
+// It is used for FUSE_READDIR.
+//
+// Dynamically-sized objects cannot be marshalled.
+type FUSEDirents struct {
+	marshal.StubMarshallable
+
+	Dirents []*FUSEDirent
+}
+
+// FUSEDirent is a Dirent received from the FUSE daemon server.
+// It is used for FUSE_READDIR.
+//
+// Dynamically-sized objects cannot be marshalled.
+type FUSEDirent struct {
+	marshal.StubMarshallable
+
+	// Meta contains all the static fields of FUSEDirent.
+	Meta FUSEDirentMeta
+
+	// Name is the filename of the dirent.
+	Name string
+}
+
+// FUSEDirentMeta contains all the static fields of FUSEDirent.
+// It is used for FUSE_READDIR.
+//
+// +marshal
+type FUSEDirentMeta struct {
+	// Inode of the dirent.
+	Ino uint64
+
+	// Offset of the dirent.
+	Off uint64
+
+	// NameLen is the length of the dirent name.
+	NameLen uint32
+
+	// Type of the dirent.
+	Type uint32
+}
+
+// SizeBytes is the size of the memory representation of FUSEDirents.
+func (r *FUSEDirents) SizeBytes() int {
+	var sizeBytes int
+	for _, dirent := range r.Dirents {
+		sizeBytes += dirent.SizeBytes()
+	}
+
+	return sizeBytes
+}
+
+// UnmarshalBytes deserializes FUSEDirents from the src buffer.
+func (r *FUSEDirents) UnmarshalBytes(src []byte) {
+	for {
+		if len(src) <= (*FUSEDirentMeta)(nil).SizeBytes() {
+			break
+		}
+
+		// Its unclear how many dirents there are in src. Each dirent is dynamically
+		// sized and so we can't make assumptions about how many dirents we can allocate.
+		if r.Dirents == nil {
+			r.Dirents = make([]*FUSEDirent, 0)
+		}
+
+		// We have to allocate a struct for each dirent - there must be a better way
+		// to do this. Linux allocates 1 page to store all the dirents and then
+		// simply reads them from the page.
+		var dirent FUSEDirent
+		dirent.UnmarshalBytes(src)
+		r.Dirents = append(r.Dirents, &dirent)
+
+		src = src[dirent.SizeBytes():]
+	}
+}
+
+// SizeBytes is the size of the memory representation of FUSEDirent.
+func (r *FUSEDirent) SizeBytes() int {
+	dataSize := r.Meta.SizeBytes() + len(r.Name)
+
+	// Each Dirent must be padded such that its size is a multiple
+	// of FUSE_DIRENT_ALIGN. Similar to the fuse dirent alignment
+	// in linux/fuse.h.
+	return (dataSize + (FUSE_DIRENT_ALIGN - 1)) & ^(FUSE_DIRENT_ALIGN - 1)
+}
+
+// UnmarshalBytes deserializes FUSEDirent from the src buffer.
+func (r *FUSEDirent) UnmarshalBytes(src []byte) {
+	r.Meta.UnmarshalBytes(src)
+	src = src[r.Meta.SizeBytes():]
+
+	if r.Meta.NameLen > FUSE_NAME_MAX {
+		// The name is too long and therefore invalid. We don't
+		// need to unmarshal the name since it'll be thrown away.
+		return
+	}
+
+	buf := make([]byte, r.Meta.NameLen)
+	name := primitive.ByteSlice(buf)
+	name.UnmarshalBytes(src[:r.Meta.NameLen])
+	r.Name = string(name)
+}
+
+// FATTR_* consts are the attribute flags defined in include/uapi/linux/fuse.h.
+// These should be or-ed together for setattr to know what has been changed.
+const (
+	FATTR_MODE      = (1 << 0)
+	FATTR_UID       = (1 << 1)
+	FATTR_GID       = (1 << 2)
+	FATTR_SIZE      = (1 << 3)
+	FATTR_ATIME     = (1 << 4)
+	FATTR_MTIME     = (1 << 5)
+	FATTR_FH        = (1 << 6)
+	FATTR_ATIME_NOW = (1 << 7)
+	FATTR_MTIME_NOW = (1 << 8)
+	FATTR_LOCKOWNER = (1 << 9)
+	FATTR_CTIME     = (1 << 10)
+)
+
+// FUSESetAttrIn is the request sent by the kernel to the daemon,
+// to set the attribute(s) of a file.
+//
+// +marshal
+type FUSESetAttrIn struct {
+	// Valid indicates which attributes are modified by this request.
+	Valid uint32
+
+	_ uint32
+
+	// Fh is used to identify the file if FATTR_FH is set in Valid.
+	Fh uint64
+
+	// Size is the size that the request wants to change to.
+	Size uint64
+
+	// LockOwner is the owner of the lock that the request wants to change to.
+	LockOwner uint64
+
+	// Atime is the access time that the request wants to change to.
+	Atime uint64
+
+	// Mtime is the modification time that the request wants to change to.
+	Mtime uint64
+
+	// Ctime is the status change time that the request wants to change to.
+	Ctime uint64
+
+	// AtimeNsec is the nano second part of Atime.
+	AtimeNsec uint32
+
+	// MtimeNsec is the nano second part of Mtime.
+	MtimeNsec uint32
+
+	// CtimeNsec is the nano second part of Ctime.
+	CtimeNsec uint32
+
+	// Mode is the file mode that the request wants to change to.
+	Mode uint32
+
+	_ uint32
+
+	// UID is the user ID of the owner that the request wants to change to.
+	UID uint32
+
+	// GID is the group ID of the owner that the request wants to change to.
+	GID uint32
+
+	_ uint32
+}
+
+// FUSEUnlinkIn is the request sent by the kernel to the daemon
+// when trying to unlink a node.
+//
+// Dynamically-sized objects cannot be marshalled.
+type FUSEUnlinkIn struct {
+	marshal.StubMarshallable
+
+	// Name of the node to unlink.
+	Name string
+}
+
+// MarshalBytes serializes r.name to the dst buffer, which should
+// have size len(r.Name) + 1 and last byte set to 0.
+func (r *FUSEUnlinkIn) MarshalBytes(buf []byte) {
+	copy(buf, r.Name)
+}
+
+// SizeBytes is the size of the memory representation of FUSEUnlinkIn.
+// 1 extra byte for null-terminated Name string.
+func (r *FUSEUnlinkIn) SizeBytes() int {
+	return len(r.Name) + 1
+}
diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go
index 2c5e56ae5..7df02dd6d 100644
--- a/pkg/abi/linux/ioctl.go
+++ b/pkg/abi/linux/ioctl.go
@@ -113,7 +113,57 @@ const (
 	_IOC_DIRSHIFT  = _IOC_SIZESHIFT + _IOC_SIZEBITS
 )
 
+// Constants from uapi/linux/fs.h.
+const (
+	FS_IOC_GETFLAGS = 2148034049
+	FS_VERITY_FL    = 1048576
+)
+
+// Constants from uapi/linux/fsverity.h.
+const (
+	FS_IOC_ENABLE_VERITY  = 1082156677
+	FS_IOC_MEASURE_VERITY = 3221513862
+)
+
+// DigestMetadata is a helper struct for VerityDigest.
+//
+// +marshal
+type DigestMetadata struct {
+	DigestAlgorithm uint16
+	DigestSize      uint16
+}
+
+// SizeOfDigestMetadata is the size of struct DigestMetadata.
+const SizeOfDigestMetadata = 4
+
+// VerityDigest is struct from uapi/linux/fsverity.h.
+type VerityDigest struct {
+	Metadata DigestMetadata
+	Digest   []byte
+}
+
 // IOC outputs the result of _IOC macro in asm-generic/ioctl.h.
 func IOC(dir, typ, nr, size uint32) uint32 {
 	return uint32(dir)<<_IOC_DIRSHIFT | typ<<_IOC_TYPESHIFT | nr<<_IOC_NRSHIFT | size<<_IOC_SIZESHIFT
 }
+
+// Kcov ioctls from kernel/kcov.h.
+var (
+	KCOV_INIT_TRACE = IOC(_IOC_READ, 'c', 1, 8)
+	KCOV_ENABLE     = IOC(_IOC_NONE, 'c', 100, 0)
+	KCOV_DISABLE    = IOC(_IOC_NONE, 'c', 101, 0)
+)
+
+// Kcov trace types from kernel/kcov.h.
+const (
+	KCOV_TRACE_PC  = 0
+	KCOV_TRACE_CMP = 1
+)
+
+// Kcov state constants from kernel/kcov.h.
+const (
+	KCOV_MODE_DISABLED  = 0
+	KCOV_MODE_INIT      = 1
+	KCOV_MODE_TRACE_PC  = 2
+	KCOV_MODE_TRACE_CMP = 3
+)
diff --git a/pkg/abi/linux/ipc.go b/pkg/abi/linux/ipc.go
index 22acd2d43..c6e65df62 100644
--- a/pkg/abi/linux/ipc.go
+++ b/pkg/abi/linux/ipc.go
@@ -37,6 +37,8 @@ const IPC_PRIVATE = 0
 // features like 32-bit UIDs.
 
 // IPCPerm is equivalent to struct ipc64_perm.
+//
+// +marshal
 type IPCPerm struct {
 	Key     uint32
 	UID     uint32
diff --git a/pkg/abi/linux/linux.go b/pkg/abi/linux/linux.go
index 281acdbde..3b4abece1 100644
--- a/pkg/abi/linux/linux.go
+++ b/pkg/abi/linux/linux.go
@@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package linux contains the constants and types needed to interface with a Linux kernel.
+// Package linux contains the constants and types needed to interface with a
+// Linux kernel.
 package linux
 
 // NumSoftIRQ is the number of software IRQs, exposed via /proc/stat.
@@ -21,6 +22,8 @@ package linux
 const NumSoftIRQ = 10
 
 // Sysinfo is the structure provided by sysinfo on linux versions > 2.3.48.
+//
+// +marshal
 type Sysinfo struct {
 	Uptime    int64
 	Loads     [3]uint64
@@ -34,6 +37,6 @@ type Sysinfo struct {
 	_         [6]byte // Pad Procs to 64bits.
 	TotalHigh uint64
 	FreeHigh  uint64
-	Unit      uint32
-	/* The _f field in the glibc version of Sysinfo has size 0 on AMD64 */
+	Unit      uint32 `marshal:"unaligned"` // Struct ends mid-64-bit-word.
+	// The _f field in the glibc version of Sysinfo has size 0 on AMD64.
 }
diff --git a/pkg/abi/linux/membarrier.go b/pkg/abi/linux/membarrier.go
new file mode 100644
index 000000000..4f6021a1d
--- /dev/null
+++ b/pkg/abi/linux/membarrier.go
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// membarrier(2) commands, from include/uapi/linux/membarrier.h.
+const (
+	MEMBARRIER_CMD_QUERY                                = 0
+	MEMBARRIER_CMD_GLOBAL                               = (1 << 0)
+	MEMBARRIER_CMD_GLOBAL_EXPEDITED                     = (1 << 1)
+	MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED            = (1 << 2)
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED                    = (1 << 3)
+	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED           = (1 << 4)
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE          = (1 << 5)
+	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6)
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ               = (1 << 7)
+	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ      = (1 << 8)
+)
+
+// membarrier(2) flags, from include/uapi/linux/membarrier.h.
+const (
+	MEMBARRIER_CMD_FLAG_CPU = (1 << 0)
+)
diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index 91e35366f..b521144d9 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -17,9 +17,9 @@ package linux
 import (
 	"io"
 
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
-	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // This file contains structures required to support netfilter, specifically
@@ -265,6 +265,18 @@ type KernelXTEntryMatch struct {
 	Data []byte
 }
 
+// XTGetRevision corresponds to xt_get_revision in
+// include/uapi/linux/netfilter/x_tables.h
+//
+// +marshal
+type XTGetRevision struct {
+	Name     ExtensionName
+	Revision uint8
+}
+
+// SizeOfXTGetRevision is the size of an XTGetRevision.
+const SizeOfXTGetRevision = 30
+
 // XTEntryTarget holds a target for a rule. For example, it can specify that
 // packets matching the rule should DROP, ACCEPT, or use an extension target.
 // iptables-extension(8) has a list of possible targets.
@@ -285,6 +297,13 @@ type XTEntryTarget struct {
 // SizeOfXTEntryTarget is the size of an XTEntryTarget.
 const SizeOfXTEntryTarget = 32
 
+// KernelXTEntryTarget is identical to XTEntryTarget, but contains a
+// variable-length Data field.
+type KernelXTEntryTarget struct {
+	XTEntryTarget
+	Data []byte
+}
+
 // XTStandardTarget is a built-in target, one of ACCEPT, DROP, JUMP, QUEUE,
 // RETURN, or jump. It corresponds to struct xt_standard_target in
 // include/uapi/linux/netfilter/x_tables.h.
@@ -450,9 +469,9 @@ func (ke *KernelIPTGetEntries) UnmarshalUnsafe(src []byte) {
 }
 
 // CopyIn implements marshal.Marshallable.CopyIn.
-func (ke *KernelIPTGetEntries) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {
-	buf := task.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay.
-	length, err := task.CopyInBytes(addr, buf)    // escapes: okay.
+func (ke *KernelIPTGetEntries) CopyIn(cc marshal.CopyContext, addr usermem.Addr) (int, error) {
+	buf := cc.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay.
+	length, err := cc.CopyInBytes(addr, buf)    // escapes: okay.
 	// Unmarshal unconditionally. If we had a short copy-in, this results in a
 	// partially unmarshalled struct.
 	ke.UnmarshalBytes(buf) // escapes: fallback.
@@ -460,21 +479,21 @@ func (ke *KernelIPTGetEntries) CopyIn(task marshal.Task, addr usermem.Addr) (int
 }
 
 // CopyOut implements marshal.Marshallable.CopyOut.
-func (ke *KernelIPTGetEntries) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {
+func (ke *KernelIPTGetEntries) CopyOut(cc marshal.CopyContext, addr usermem.Addr) (int, error) {
 	// Type KernelIPTGetEntries doesn't have a packed layout in memory, fall
 	// back to MarshalBytes.
-	return task.CopyOutBytes(addr, ke.marshalAll(task))
+	return cc.CopyOutBytes(addr, ke.marshalAll(cc))
 }
 
 // CopyOutN implements marshal.Marshallable.CopyOutN.
-func (ke *KernelIPTGetEntries) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {
+func (ke *KernelIPTGetEntries) CopyOutN(cc marshal.CopyContext, addr usermem.Addr, limit int) (int, error) {
 	// Type KernelIPTGetEntries doesn't have a packed layout in memory, fall
 	// back to MarshalBytes.
-	return task.CopyOutBytes(addr, ke.marshalAll(task)[:limit])
+	return cc.CopyOutBytes(addr, ke.marshalAll(cc)[:limit])
 }
 
-func (ke *KernelIPTGetEntries) marshalAll(task marshal.Task) []byte {
-	buf := task.CopyScratchBuffer(ke.SizeBytes())
+func (ke *KernelIPTGetEntries) marshalAll(cc marshal.CopyContext) []byte {
+	buf := cc.CopyScratchBuffer(ke.SizeBytes())
 	ke.MarshalBytes(buf)
 	return buf
 }
@@ -510,6 +529,8 @@ type IPTReplace struct {
 const SizeOfIPTReplace = 96
 
 // ExtensionName holds the name of a netfilter extension.
+//
+// +marshal
 type ExtensionName [XT_EXTENSION_MAXNAMELEN]byte
 
 // String implements fmt.Stringer.
diff --git a/pkg/abi/linux/netfilter_ipv6.go b/pkg/abi/linux/netfilter_ipv6.go
index 9bb9efb10..6d31eb5e3 100644
--- a/pkg/abi/linux/netfilter_ipv6.go
+++ b/pkg/abi/linux/netfilter_ipv6.go
@@ -17,9 +17,9 @@ package linux
 import (
 	"io"
 
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
-	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // This file contains structures required to support IPv6 netfilter and
@@ -128,9 +128,9 @@ func (ke *KernelIP6TGetEntries) UnmarshalUnsafe(src []byte) {
 }
 
 // CopyIn implements marshal.Marshallable.CopyIn.
-func (ke *KernelIP6TGetEntries) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {
-	buf := task.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay.
-	length, err := task.CopyInBytes(addr, buf)    // escapes: okay.
+func (ke *KernelIP6TGetEntries) CopyIn(cc marshal.CopyContext, addr usermem.Addr) (int, error) {
+	buf := cc.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay.
+	length, err := cc.CopyInBytes(addr, buf)    // escapes: okay.
 	// Unmarshal unconditionally. If we had a short copy-in, this results
 	// in a partially unmarshalled struct.
 	ke.UnmarshalBytes(buf) // escapes: fallback.
@@ -138,21 +138,21 @@ func (ke *KernelIP6TGetEntries) CopyIn(task marshal.Task, addr usermem.Addr) (in
 }
 
 // CopyOut implements marshal.Marshallable.CopyOut.
-func (ke *KernelIP6TGetEntries) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {
+func (ke *KernelIP6TGetEntries) CopyOut(cc marshal.CopyContext, addr usermem.Addr) (int, error) {
 	// Type KernelIP6TGetEntries doesn't have a packed layout in memory,
 	// fall back to MarshalBytes.
-	return task.CopyOutBytes(addr, ke.marshalAll(task))
+	return cc.CopyOutBytes(addr, ke.marshalAll(cc))
 }
 
 // CopyOutN implements marshal.Marshallable.CopyOutN.
-func (ke *KernelIP6TGetEntries) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {
+func (ke *KernelIP6TGetEntries) CopyOutN(cc marshal.CopyContext, addr usermem.Addr, limit int) (int, error) {
 	// Type KernelIP6TGetEntries doesn't have a packed layout in memory, fall
 	// back to MarshalBytes.
-	return task.CopyOutBytes(addr, ke.marshalAll(task)[:limit])
+	return cc.CopyOutBytes(addr, ke.marshalAll(cc)[:limit])
 }
 
-func (ke *KernelIP6TGetEntries) marshalAll(task marshal.Task) []byte {
-	buf := task.CopyScratchBuffer(ke.SizeBytes())
+func (ke *KernelIP6TGetEntries) marshalAll(cc marshal.CopyContext) []byte {
+	buf := cc.CopyScratchBuffer(ke.SizeBytes())
 	ke.MarshalBytes(buf)
 	return buf
 }
@@ -290,6 +290,19 @@ type IP6TIP struct {
 
 const SizeOfIP6TIP = 136
 
+// Flags in IP6TIP.Flags. Corresponding constants are in
+// include/uapi/linux/netfilter_ipv6/ip6_tables.h.
+const (
+	// Whether to check the Protocol field.
+	IP6T_F_PROTO = 0x01
+	// Whether to match the TOS field.
+	IP6T_F_TOS = 0x02
+	// Indicates that the jump target is an aboslute GOTO, not an offset.
+	IP6T_F_GOTO = 0x04
+	// Enables all flags.
+	IP6T_F_MASK = 0x07
+)
+
 // Flags in IP6TIP.InverseFlags. Corresponding constants are in
 // include/uapi/linux/netfilter_ipv6/ip6_tables.h.
 const (
@@ -308,3 +321,16 @@ const (
 	// Enable all flags.
 	IP6T_INV_MASK = 0x7F
 )
+
+// NFNATRange corresponds to struct nf_nat_range in
+// include/uapi/linux/netfilter/nf_nat.h.
+type NFNATRange struct {
+	Flags    uint32
+	MinAddr  Inet6Addr
+	MaxAddr  Inet6Addr
+	MinProto uint16 // Network byte order.
+	MaxProto uint16 // Network byte order.
+}
+
+// SizeOfNFNATRange is the size of NFNATRange.
+const SizeOfNFNATRange = 40
diff --git a/pkg/abi/linux/netlink.go b/pkg/abi/linux/netlink.go
index 0ba086c76..b41f94a69 100644
--- a/pkg/abi/linux/netlink.go
+++ b/pkg/abi/linux/netlink.go
@@ -40,6 +40,8 @@ const (
 )
 
 // SockAddrNetlink is struct sockaddr_nl, from uapi/linux/netlink.h.
+//
+// +marshal
 type SockAddrNetlink struct {
 	Family uint16
 	_      uint16
diff --git a/pkg/abi/linux/poll.go b/pkg/abi/linux/poll.go
index c04d26e4c..3443a5768 100644
--- a/pkg/abi/linux/poll.go
+++ b/pkg/abi/linux/poll.go
@@ -15,6 +15,8 @@
 package linux
 
 // PollFD is struct pollfd, used by poll(2)/ppoll(2), from uapi/asm-generic/poll.h.
+//
+// +marshal slice:PollFDSlice
 type PollFD struct {
 	FD      int32
 	Events  int16
diff --git a/pkg/abi/linux/rusage.go b/pkg/abi/linux/rusage.go
index d8302dc85..e29d0ac7e 100644
--- a/pkg/abi/linux/rusage.go
+++ b/pkg/abi/linux/rusage.go
@@ -26,6 +26,8 @@ const (
 )
 
 // Rusage represents the Linux struct rusage.
+//
+// +marshal
 type Rusage struct {
 	UTime    Timeval
 	STime    Timeval
diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go
index d0607e256..5be3f10f9 100644
--- a/pkg/abi/linux/seccomp.go
+++ b/pkg/abi/linux/seccomp.go
@@ -34,11 +34,11 @@ type BPFAction uint32
 
 const (
 	SECCOMP_RET_KILL_PROCESS BPFAction = 0x80000000
-	SECCOMP_RET_KILL_THREAD            = 0x00000000
-	SECCOMP_RET_TRAP                   = 0x00030000
-	SECCOMP_RET_ERRNO                  = 0x00050000
-	SECCOMP_RET_TRACE                  = 0x7ff00000
-	SECCOMP_RET_ALLOW                  = 0x7fff0000
+	SECCOMP_RET_KILL_THREAD  BPFAction = 0x00000000
+	SECCOMP_RET_TRAP         BPFAction = 0x00030000
+	SECCOMP_RET_ERRNO        BPFAction = 0x00050000
+	SECCOMP_RET_TRACE        BPFAction = 0x7ff00000
+	SECCOMP_RET_ALLOW        BPFAction = 0x7fff0000
 )
 
 func (a BPFAction) String() string {
@@ -64,9 +64,41 @@ func (a BPFAction) Data() uint16 {
 	return uint16(a & SECCOMP_RET_DATA)
 }
 
+// WithReturnCode sets the lower 16 bits of the SECCOMP_RET_ERRNO or
+// SECCOMP_RET_TRACE actions to the provided return code, overwriting the previous
+// action, and returns a new BPFAction. If not SECCOMP_RET_ERRNO or
+// SECCOMP_RET_TRACE then this panics.
+func (a BPFAction) WithReturnCode(code uint16) BPFAction {
+	// mask out the previous return value
+	baseAction := a & SECCOMP_RET_ACTION_FULL
+	if baseAction == SECCOMP_RET_ERRNO || baseAction == SECCOMP_RET_TRACE {
+		return BPFAction(uint32(baseAction) | uint32(code))
+	}
+	panic("WithReturnCode only valid for SECCOMP_RET_ERRNO and SECCOMP_RET_TRACE")
+}
+
 // SockFprog is sock_fprog taken from <linux/filter.h>.
 type SockFprog struct {
 	Len    uint16
 	pad    [6]byte
 	Filter *BPFInstruction
 }
+
+// SeccompData is equivalent to struct seccomp_data, which contains the data
+// passed to seccomp-bpf filters.
+//
+// +marshal
+type SeccompData struct {
+	// Nr is the system call number.
+	Nr int32
+
+	// Arch is an AUDIT_ARCH_* value indicating the system call convention.
+	Arch uint32
+
+	// InstructionPointer is the value of the instruction pointer at the time
+	// of the system call.
+	InstructionPointer uint64
+
+	// Args contains the first 6 system call arguments.
+	Args [6]uint64
+}
diff --git a/pkg/abi/linux/sem.go b/pkg/abi/linux/sem.go
index de422c519..1b2f76c0b 100644
--- a/pkg/abi/linux/sem.go
+++ b/pkg/abi/linux/sem.go
@@ -34,17 +34,9 @@ const (
 
 const SEM_UNDO = 0x1000
 
-// SemidDS is equivalent to struct semid64_ds.
-type SemidDS struct {
-	SemPerm  IPCPerm
-	SemOTime TimeT
-	SemCTime TimeT
-	SemNSems uint64
-	unused3  uint64
-	unused4  uint64
-}
-
 // Sembuf is equivalent to struct sembuf.
+//
+// +marshal slice:SembufSlice
 type Sembuf struct {
 	SemNum uint16
 	SemOp  int16
diff --git a/pkg/abi/linux/sem_amd64.go b/pkg/abi/linux/sem_amd64.go
new file mode 100644
index 000000000..ab980cb4f
--- /dev/null
+++ b/pkg/abi/linux/sem_amd64.go
@@ -0,0 +1,33 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+// SemidDS is equivalent to struct semid64_ds.
+//
+// Source: arch/x86/include/uapi/asm/sembuf.h
+//
+// +marshal
+type SemidDS struct {
+	SemPerm  IPCPerm
+	SemOTime TimeT
+	unused1  uint64
+	SemCTime TimeT
+	unused2  uint64
+	SemNSems uint64
+	unused3  uint64
+	unused4  uint64
+}
diff --git a/pkg/abi/linux/sem_arm64.go b/pkg/abi/linux/sem_arm64.go
new file mode 100644
index 000000000..521468fb1
--- /dev/null
+++ b/pkg/abi/linux/sem_arm64.go
@@ -0,0 +1,31 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package linux
+
+// SemidDS is equivalent to struct semid64_ds.
+//
+// Source: include/uapi/asm-generic/sembuf.h
+//
+// +marshal
+type SemidDS struct {
+	SemPerm  IPCPerm
+	SemOTime TimeT
+	SemCTime TimeT
+	SemNSems uint64
+	unused3  uint64
+	unused4  uint64
+}
diff --git a/pkg/abi/linux/shm.go b/pkg/abi/linux/shm.go
index e45aadb10..274b1e847 100644
--- a/pkg/abi/linux/shm.go
+++ b/pkg/abi/linux/shm.go
@@ -51,6 +51,8 @@ const (
 
 // ShmidDS is equivalent to struct shmid64_ds. Source:
 // include/uapi/asm-generic/shmbuf.h
+//
+// +marshal
 type ShmidDS struct {
 	ShmPerm    IPCPerm
 	ShmSegsz   uint64
@@ -66,6 +68,8 @@ type ShmidDS struct {
 }
 
 // ShmParams is equivalent to struct shminfo. Source: include/uapi/linux/shm.h
+//
+// +marshal
 type ShmParams struct {
 	ShmMax uint64
 	ShmMin uint64
@@ -75,6 +79,8 @@ type ShmParams struct {
 }
 
 // ShmInfo is equivalent to struct shm_info. Source: include/uapi/linux/shm.h
+//
+// +marshal
 type ShmInfo struct {
 	UsedIDs       int32 // Number of currently existing segments.
 	_             [4]byte
diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go
index 1c330e763..6ca57ffbb 100644
--- a/pkg/abi/linux/signal.go
+++ b/pkg/abi/linux/signal.go
@@ -214,6 +214,8 @@ const (
 )
 
 // Sigevent represents struct sigevent.
+//
+// +marshal
 type Sigevent struct {
 	Value  uint64 // union sigval {int, void*}
 	Signo  int32
diff --git a/pkg/abi/linux/signalfd.go b/pkg/abi/linux/signalfd.go
index 85fad9956..468c6a387 100644
--- a/pkg/abi/linux/signalfd.go
+++ b/pkg/abi/linux/signalfd.go
@@ -23,6 +23,8 @@ const (
 )
 
 // SignalfdSiginfo is the siginfo encoding for signalfds.
+//
+// +marshal
 type SignalfdSiginfo struct {
 	Signo   uint32
 	Errno   int32
@@ -41,5 +43,5 @@ type SignalfdSiginfo struct {
 	STime   uint64
 	Addr    uint64
 	AddrLSB uint16
-	_       [48]uint8
+	_       [48]uint8 `marshal:"unaligned"`
 }
diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go
index e37c8727d..d156d41e4 100644
--- a/pkg/abi/linux/socket.go
+++ b/pkg/abi/linux/socket.go
@@ -14,7 +14,10 @@
 
 package linux
 
-import "gvisor.dev/gvisor/pkg/binary"
+import (
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/marshal"
+)
 
 // Address families, from linux/socket.h.
 const (
@@ -265,6 +268,8 @@ type InetMulticastRequestWithNIC struct {
 type Inet6Addr [16]byte
 
 // SockAddrInet6 is struct sockaddr_in6, from uapi/linux/in6.h.
+//
+// +marshal
 type SockAddrInet6 struct {
 	Family   uint16
 	Port     uint16
@@ -274,6 +279,8 @@ type SockAddrInet6 struct {
 }
 
 // SockAddrLink is a struct sockaddr_ll, from uapi/linux/if_packet.h.
+//
+// +marshal
 type SockAddrLink struct {
 	Family          uint16
 	Protocol        uint16
@@ -290,6 +297,8 @@ type SockAddrLink struct {
 const UnixPathMax = 108
 
 // SockAddrUnix is struct sockaddr_un, from uapi/linux/un.h.
+//
+// +marshal
 type SockAddrUnix struct {
 	Family uint16
 	Path   [UnixPathMax]int8
@@ -299,6 +308,8 @@ type SockAddrUnix struct {
 // equivalent to struct sockaddr. SockAddr ensures that a well-defined set of
 // types can be used as socket addresses.
 type SockAddr interface {
+	marshal.Marshallable
+
 	// implementsSockAddr exists purely to allow a type to indicate that they
 	// implement this interface. This method is a no-op and shouldn't be called.
 	implementsSockAddr()
diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go
index e6860ed49..206f5af7e 100644
--- a/pkg/abi/linux/time.go
+++ b/pkg/abi/linux/time.go
@@ -93,6 +93,8 @@ const (
 const maxSecInDuration = math.MaxInt64 / int64(time.Second)
 
 // TimeT represents time_t in <time.h>. It represents time in seconds.
+//
+// +marshal
 type TimeT int64
 
 // NsecToTimeT translates nanoseconds to TimeT (seconds).
@@ -102,7 +104,7 @@ func NsecToTimeT(nsec int64) TimeT {
 
 // Timespec represents struct timespec in <time.h>.
 //
-// +marshal
+// +marshal slice:TimespecSlice
 type Timespec struct {
 	Sec  int64
 	Nsec int64
@@ -158,7 +160,7 @@ const SizeOfTimeval = 16
 
 // Timeval represents struct timeval in <time.h>.
 //
-// +marshal
+// +marshal slice:TimevalSlice
 type Timeval struct {
 	Sec  int64
 	Usec int64
@@ -196,6 +198,8 @@ func DurationToTimeval(dur time.Duration) Timeval {
 }
 
 // Itimerspec represents struct itimerspec in <time.h>.
+//
+// +marshal
 type Itimerspec struct {
 	Interval Timespec
 	Value    Timespec
@@ -206,12 +210,16 @@ type Itimerspec struct {
 //     struct timeval it_interval; /* next value */
 //     struct timeval it_value;    /* current value */
 //   };
+//
+// +marshal
 type ItimerVal struct {
 	Interval Timeval
 	Value    Timeval
 }
 
 // ClockT represents type clock_t.
+//
+// +marshal
 type ClockT int64
 
 // ClockTFromDuration converts time.Duration to clock_t.
@@ -220,6 +228,8 @@ func ClockTFromDuration(d time.Duration) ClockT {
 }
 
 // Tms represents struct tms, used by times(2).
+//
+// +marshal
 type Tms struct {
 	UTime  ClockT
 	STime  ClockT
@@ -229,6 +239,8 @@ type Tms struct {
 
 // TimerID represents type timer_t, which identifies a POSIX per-process
 // interval timer.
+//
+// +marshal
 type TimerID int32
 
 // StatxTimestamp represents struct statx_timestamp.
diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go
index 8ac02aee8..47e65d9fb 100644
--- a/pkg/abi/linux/tty.go
+++ b/pkg/abi/linux/tty.go
@@ -23,6 +23,8 @@ const (
 )
 
 // Winsize is struct winsize, defined in uapi/asm-generic/termios.h.
+//
+// +marshal
 type Winsize struct {
 	Row    uint16
 	Col    uint16
@@ -31,6 +33,8 @@ type Winsize struct {
 }
 
 // Termios is struct termios, defined in uapi/asm-generic/termbits.h.
+//
+// +marshal
 type Termios struct {
 	InputFlags        uint32
 	OutputFlags       uint32
@@ -321,9 +325,9 @@ var MasterTermios = KernelTermios{
 	OutputSpeed:       38400,
 }
 
-// DefaultSlaveTermios is the default terminal configuration of the slave end
-// of a Unix98 pseudoterminal.
-var DefaultSlaveTermios = KernelTermios{
+// DefaultReplicaTermios is the default terminal configuration of the replica
+// end of a Unix98 pseudoterminal.
+var DefaultReplicaTermios = KernelTermios{
 	InputFlags:        ICRNL | IXON,
 	OutputFlags:       OPOST | ONLCR,
 	ControlFlags:      B38400 | CS8 | CREAD,
@@ -337,6 +341,7 @@ var DefaultSlaveTermios = KernelTermios{
 // include/uapi/asm-generic/termios.h.
 //
 // +stateify savable
+// +marshal
 type WindowSize struct {
 	Rows uint16
 	Cols uint16
diff --git a/pkg/abi/linux/utsname.go b/pkg/abi/linux/utsname.go
index 60f220a67..cb7c95437 100644
--- a/pkg/abi/linux/utsname.go
+++ b/pkg/abi/linux/utsname.go
@@ -26,6 +26,8 @@ const (
 )
 
 // UtsName represents struct utsname, the struct returned by uname(2).
+//
+// +marshal
 type UtsName struct {
 	Sysname    [UTSLen + 1]byte
 	Nodename   [UTSLen + 1]byte
diff --git a/pkg/abi/linux/xattr.go b/pkg/abi/linux/xattr.go
index 99180b208..8ef837f27 100644
--- a/pkg/abi/linux/xattr.go
+++ b/pkg/abi/linux/xattr.go
@@ -23,6 +23,9 @@ const (
 	XATTR_CREATE  = 1
 	XATTR_REPLACE = 2
 
+	XATTR_TRUSTED_PREFIX     = "trusted."
+	XATTR_TRUSTED_PREFIX_LEN = len(XATTR_TRUSTED_PREFIX)
+
 	XATTR_USER_PREFIX     = "user."
 	XATTR_USER_PREFIX_LEN = len(XATTR_USER_PREFIX)
 )
diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD
index ffc918846..bd3a5cce9 100644
--- a/pkg/amutex/BUILD
+++ b/pkg/amutex/BUILD
@@ -6,7 +6,10 @@ go_library(
     name = "amutex",
     srcs = ["amutex.go"],
     visibility = ["//:sandbox"],
-    deps = ["//pkg/syserror"],
+    deps = [
+        "//pkg/context",
+        "//pkg/syserror",
+    ],
 )
 
 go_test(
diff --git a/pkg/amutex/amutex.go b/pkg/amutex/amutex.go
index a078a31db..d7acc1d9f 100644
--- a/pkg/amutex/amutex.go
+++ b/pkg/amutex/amutex.go
@@ -19,41 +19,17 @@ package amutex
 import (
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // Sleeper must be implemented by users of the abortable mutex to allow for
 // cancellation of waits.
-type Sleeper interface {
-	// SleepStart is called by the AbortableMutex.Lock() function when the
-	// mutex is contended and the goroutine is about to sleep.
-	//
-	// A channel can be returned that causes the sleep to be canceled if
-	// it's readable. If no cancellation is desired, nil can be returned.
-	SleepStart() <-chan struct{}
-
-	// SleepFinish is called by AbortableMutex.Lock() once a contended mutex
-	// is acquired or the wait is aborted.
-	SleepFinish(success bool)
-
-	// Interrupted returns true if the wait is aborted.
-	Interrupted() bool
-}
+type Sleeper = context.ChannelSleeper
 
 // NoopSleeper is a stateless no-op implementation of Sleeper for anonymous
 // embedding in other types that do not support cancelation.
-type NoopSleeper struct{}
-
-// SleepStart implements Sleeper.SleepStart.
-func (NoopSleeper) SleepStart() <-chan struct{} {
-	return nil
-}
-
-// SleepFinish implements Sleeper.SleepFinish.
-func (NoopSleeper) SleepFinish(success bool) {}
-
-// Interrupted implements Sleeper.Interrupted.
-func (NoopSleeper) Interrupted() bool { return false }
+type NoopSleeper = context.Context
 
 // Block blocks until either receiving from ch succeeds (in which case it
 // returns nil) or sleeper is interrupted (in which case it returns
diff --git a/pkg/bpf/decoder.go b/pkg/bpf/decoder.go
index c8ee0c3b1..069d0395d 100644
--- a/pkg/bpf/decoder.go
+++ b/pkg/bpf/decoder.go
@@ -21,10 +21,15 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 )
 
-// DecodeProgram translates an array of BPF instructions into text format.
-func DecodeProgram(program []linux.BPFInstruction) (string, error) {
+// DecodeProgram translates a compiled BPF program into text format.
+func DecodeProgram(p Program) (string, error) {
+	return DecodeInstructions(p.instructions)
+}
+
+// DecodeInstructions translates an array of BPF instructions into text format.
+func DecodeInstructions(instns []linux.BPFInstruction) (string, error) {
 	var ret bytes.Buffer
-	for line, s := range program {
+	for line, s := range instns {
 		ret.WriteString(fmt.Sprintf("%v: ", line))
 		if err := decode(s, line, &ret); err != nil {
 			return "", err
@@ -34,7 +39,7 @@ func DecodeProgram(program []linux.BPFInstruction) (string, error) {
 	return ret.String(), nil
 }
 
-// Decode translates BPF instruction into text format.
+// Decode translates a single BPF instruction into text format.
 func Decode(inst linux.BPFInstruction) (string, error) {
 	var ret bytes.Buffer
 	err := decode(inst, -1, &ret)
diff --git a/pkg/bpf/decoder_test.go b/pkg/bpf/decoder_test.go
index 6a023f0c0..bb971ce21 100644
--- a/pkg/bpf/decoder_test.go
+++ b/pkg/bpf/decoder_test.go
@@ -93,7 +93,7 @@ func TestDecode(t *testing.T) {
 	}
 }
 
-func TestDecodeProgram(t *testing.T) {
+func TestDecodeInstructions(t *testing.T) {
 	for _, test := range []struct {
 		name     string
 		program  []linux.BPFInstruction
@@ -126,7 +126,7 @@ func TestDecodeProgram(t *testing.T) {
 			program: []linux.BPFInstruction{Stmt(Ld+Abs+W, 10), Stmt(Ld+Len+Mem, 0)},
 			fail:    true},
 	} {
-		got, err := DecodeProgram(test.program)
+		got, err := DecodeInstructions(test.program)
 		if test.fail {
 			if err == nil {
 				t.Errorf("%s: Decode(...) failed, expected: 'error', got: %q", test.name, got)
diff --git a/pkg/bpf/program_builder.go b/pkg/bpf/program_builder.go
index 7992044d0..caaf99c83 100644
--- a/pkg/bpf/program_builder.go
+++ b/pkg/bpf/program_builder.go
@@ -32,13 +32,21 @@ type ProgramBuilder struct {
 	// Maps label names to label objects.
 	labels map[string]*label
 
+	// unusableLabels are labels that are added before being referenced in a
+	// jump. Any labels added this way cannot be referenced later in order to
+	// avoid backwards references.
+	unusableLabels map[string]bool
+
 	// Array of BPF instructions that makes up the program.
 	instructions []linux.BPFInstruction
 }
 
 // NewProgramBuilder creates a new ProgramBuilder instance.
 func NewProgramBuilder() *ProgramBuilder {
-	return &ProgramBuilder{labels: map[string]*label{}}
+	return &ProgramBuilder{
+		labels:         map[string]*label{},
+		unusableLabels: map[string]bool{},
+	}
 }
 
 // label contains information to resolve a label to an offset.
@@ -108,9 +116,12 @@ func (b *ProgramBuilder) AddJumpLabels(code uint16, k uint32, jtLabel, jfLabel s
 func (b *ProgramBuilder) AddLabel(name string) error {
 	l, ok := b.labels[name]
 	if !ok {
-		// This is done to catch jump backwards cases, but it's not strictly wrong
-		// to have unused labels.
-		return fmt.Errorf("Adding a label that hasn't been used is not allowed: %v", name)
+		if _, ok = b.unusableLabels[name]; ok {
+			return fmt.Errorf("label %q already set", name)
+		}
+		// Mark the label as unusable. This is done to catch backwards jumps.
+		b.unusableLabels[name] = true
+		return nil
 	}
 	if l.target != -1 {
 		return fmt.Errorf("label %q target already set: %v", name, l.target)
@@ -141,6 +152,10 @@ func (b *ProgramBuilder) addLabelSource(labelName string, t jmpType) {
 
 func (b *ProgramBuilder) resolveLabels() error {
 	for key, v := range b.labels {
+		if _, ok := b.unusableLabels[key]; ok {
+			return fmt.Errorf("backwards reference detected for label: %q", key)
+		}
+
 		if v.target == -1 {
 			return fmt.Errorf("label target not set: %v", key)
 		}
diff --git a/pkg/bpf/program_builder_test.go b/pkg/bpf/program_builder_test.go
index 92ca5f4c3..37f684f25 100644
--- a/pkg/bpf/program_builder_test.go
+++ b/pkg/bpf/program_builder_test.go
@@ -26,16 +26,16 @@ func validate(p *ProgramBuilder, expected []linux.BPFInstruction) error {
 	if err != nil {
 		return fmt.Errorf("Instructions() failed: %v", err)
 	}
-	got, err := DecodeProgram(instructions)
+	got, err := DecodeInstructions(instructions)
 	if err != nil {
-		return fmt.Errorf("DecodeProgram('instructions') failed: %v", err)
+		return fmt.Errorf("DecodeInstructions('instructions') failed: %v", err)
 	}
-	expectedDecoded, err := DecodeProgram(expected)
+	expectedDecoded, err := DecodeInstructions(expected)
 	if err != nil {
-		return fmt.Errorf("DecodeProgram('expected') failed: %v", err)
+		return fmt.Errorf("DecodeInstructions('expected') failed: %v", err)
 	}
 	if got != expectedDecoded {
-		return fmt.Errorf("DecodeProgram() failed, expected: %q, got: %q", expectedDecoded, got)
+		return fmt.Errorf("DecodeInstructions() failed, expected: %q, got: %q", expectedDecoded, got)
 	}
 	return nil
 }
@@ -124,10 +124,38 @@ func TestProgramBuilderLabelWithNoInstruction(t *testing.T) {
 	}
 }
 
+// TestProgramBuilderUnusedLabel tests that adding an unused label doesn't
+// cause program generation to fail.
 func TestProgramBuilderUnusedLabel(t *testing.T) {
 	p := NewProgramBuilder()
-	if err := p.AddLabel("unused"); err == nil {
-		t.Errorf("AddLabel(unused) should have failed")
+	p.AddStmt(Ld+Abs+W, 10)
+	p.AddJump(Jmp+Ja, 10, 0, 0)
+
+	expected := []linux.BPFInstruction{
+		Stmt(Ld+Abs+W, 10),
+		Jump(Jmp+Ja, 10, 0, 0),
+	}
+
+	if err := p.AddLabel("unused"); err != nil {
+		t.Errorf("AddLabel(unused) should have succeeded")
+	}
+
+	if err := validate(p, expected); err != nil {
+		t.Errorf("Validate() failed: %v", err)
+	}
+}
+
+// TestProgramBuilderBackwardsReference tests that including a backwards
+// reference to a label in a program causes a failure.
+func TestProgramBuilderBackwardsReference(t *testing.T) {
+	p := NewProgramBuilder()
+	if err := p.AddLabel("bw_label"); err != nil {
+		t.Errorf("failed to add label")
+	}
+	p.AddStmt(Ld+Abs+W, 10)
+	p.AddJumpTrueLabel(Jmp+Jeq+K, 10, "bw_label", 0)
+	if _, err := p.Instructions(); err == nil {
+		t.Errorf("Instructions() should have failed")
 	}
 }
 
diff --git a/pkg/buffer/BUILD b/pkg/buffer/BUILD
index dcd086298..1186f788e 100644
--- a/pkg/buffer/BUILD
+++ b/pkg/buffer/BUILD
@@ -20,14 +20,17 @@ go_library(
     srcs = [
         "buffer.go",
         "buffer_list.go",
+        "pool.go",
         "safemem.go",
         "view.go",
         "view_unsafe.go",
     ],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/context",
         "//pkg/log",
         "//pkg/safemem",
+        "//pkg/usermem",
     ],
 )
 
@@ -35,9 +38,13 @@ go_test(
     name = "buffer_test",
     size = "small",
     srcs = [
+        "pool_test.go",
         "safemem_test.go",
         "view_test.go",
     ],
     library = ":buffer",
-    deps = ["//pkg/safemem"],
+    deps = [
+        "//pkg/safemem",
+        "//pkg/state",
+    ],
 )
diff --git a/pkg/buffer/buffer.go b/pkg/buffer/buffer.go
index c6d089fd9..311808ae9 100644
--- a/pkg/buffer/buffer.go
+++ b/pkg/buffer/buffer.go
@@ -14,36 +14,26 @@
 
 // Package buffer provides the implementation of a buffer view.
 //
-// A view is an flexible buffer, backed by a pool, supporting the safecopy
-// operations natively as well as the ability to grow via either prepend or
-// append, as well as shrink.
+// A view is an flexible buffer, supporting the safecopy operations natively as
+// well as the ability to grow via either prepend or append, as well as shrink.
 package buffer
 
-import (
-	"sync"
-)
-
-const bufferSize = 8144 // See below.
-
 // buffer encapsulates a queueable byte buffer.
 //
-// Note that the total size is slightly less than two pages. This is done
-// intentionally to ensure that the buffer object aligns with runtime
-// internals. We have no hard size or alignment requirements. This two page
-// size will effectively minimize internal fragmentation, but still have a
-// large enough chunk to limit excessive segmentation.
-//
 // +stateify savable
 type buffer struct {
-	data  [bufferSize]byte
+	data  []byte
 	read  int
 	write int
 	bufferEntry
 }
 
-// reset resets internal data.
-//
-// This must be called before returning the buffer to the pool.
+// init performs in-place initialization for zero value.
+func (b *buffer) init(size int) {
+	b.data = make([]byte, size)
+}
+
+// Reset resets read and write locations, effectively emptying the buffer.
 func (b *buffer) Reset() {
 	b.read = 0
 	b.write = 0
@@ -85,10 +75,3 @@ func (b *buffer) WriteMove(n int) {
 func (b *buffer) WriteSlice() []byte {
 	return b.data[b.write:]
 }
-
-// bufferPool is a pool for buffers.
-var bufferPool = sync.Pool{
-	New: func() interface{} {
-		return new(buffer)
-	},
-}
diff --git a/pkg/buffer/pool.go b/pkg/buffer/pool.go
new file mode 100644
index 000000000..7ad6132ab
--- /dev/null
+++ b/pkg/buffer/pool.go
@@ -0,0 +1,83 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+const (
+	// embeddedCount is the number of buffer structures embedded in the pool. It
+	// is also the number for overflow allocations.
+	embeddedCount = 8
+
+	// defaultBufferSize is the default size for each underlying storage buffer.
+	//
+	// It is slightly less than two pages. This is done intentionally to ensure
+	// that the buffer object aligns with runtime internals. This two page size
+	// will effectively minimize internal fragmentation, but still have a large
+	// enough chunk to limit excessive segmentation.
+	defaultBufferSize = 8144
+)
+
+// pool allocates buffer.
+//
+// It contains an embedded buffer storage for fast path when the number of
+// buffers needed is small.
+//
+// +stateify savable
+type pool struct {
+	bufferSize      int
+	avail           []buffer              `state:"nosave"`
+	embeddedStorage [embeddedCount]buffer `state:"wait"`
+}
+
+// get gets a new buffer from p.
+func (p *pool) get() *buffer {
+	if p.avail == nil {
+		p.avail = p.embeddedStorage[:]
+	}
+	if len(p.avail) == 0 {
+		p.avail = make([]buffer, embeddedCount)
+	}
+	if p.bufferSize <= 0 {
+		p.bufferSize = defaultBufferSize
+	}
+	buf := &p.avail[0]
+	buf.init(p.bufferSize)
+	p.avail = p.avail[1:]
+	return buf
+}
+
+// put releases buf.
+func (p *pool) put(buf *buffer) {
+	// Remove reference to the underlying storage, allowing it to be garbage
+	// collected.
+	buf.data = nil
+}
+
+// setBufferSize sets the size of underlying storage buffer for future
+// allocations. It can be called at any time.
+func (p *pool) setBufferSize(size int) {
+	p.bufferSize = size
+}
+
+// afterLoad is invoked by stateify.
+func (p *pool) afterLoad() {
+	// S/R does not save subslice into embeddedStorage correctly. Restore
+	// available portion of embeddedStorage manually. Restore as nil if none used.
+	for i := len(p.embeddedStorage); i > 0; i-- {
+		if p.embeddedStorage[i-1].data != nil {
+			p.avail = p.embeddedStorage[i:]
+			break
+		}
+	}
+}
diff --git a/pkg/buffer/pool_test.go b/pkg/buffer/pool_test.go
new file mode 100644
index 000000000..8584bac89
--- /dev/null
+++ b/pkg/buffer/pool_test.go
@@ -0,0 +1,51 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"testing"
+)
+
+func TestGetDefaultBufferSize(t *testing.T) {
+	var p pool
+	for i := 0; i < embeddedCount*2; i++ {
+		buf := p.get()
+		if got, want := len(buf.data), defaultBufferSize; got != want {
+			t.Errorf("#%d len(buf.data) = %d, want %d", i, got, want)
+		}
+	}
+}
+
+func TestGetCustomBufferSize(t *testing.T) {
+	const size = 100
+
+	var p pool
+	p.setBufferSize(size)
+	for i := 0; i < embeddedCount*2; i++ {
+		buf := p.get()
+		if got, want := len(buf.data), size; got != want {
+			t.Errorf("#%d len(buf.data) = %d, want %d", i, got, want)
+		}
+	}
+}
+
+func TestPut(t *testing.T) {
+	var p pool
+	buf := p.get()
+	p.put(buf)
+	if buf.data != nil {
+		t.Errorf("buf.data = %x, want nil", buf.data)
+	}
+}
diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go
index b789e56e9..8b42575b4 100644
--- a/pkg/buffer/safemem.go
+++ b/pkg/buffer/safemem.go
@@ -44,7 +44,7 @@ func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, e
 	// Need at least one buffer.
 	firstBuf := v.data.Back()
 	if firstBuf == nil {
-		firstBuf = bufferPool.Get().(*buffer)
+		firstBuf = v.pool.get()
 		v.data.PushBack(firstBuf)
 	}
 
@@ -56,7 +56,7 @@ func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, e
 		count -= l
 		blocks = append(blocks, firstBuf.WriteBlock())
 		for count > 0 {
-			emptyBuf := bufferPool.Get().(*buffer)
+			emptyBuf := v.pool.get()
 			v.data.PushBack(emptyBuf)
 			block := emptyBuf.WriteBlock().TakeFirst64(count)
 			count -= uint64(block.Len())
diff --git a/pkg/buffer/safemem_test.go b/pkg/buffer/safemem_test.go
index 47f357e0c..721cc5934 100644
--- a/pkg/buffer/safemem_test.go
+++ b/pkg/buffer/safemem_test.go
@@ -23,6 +23,8 @@ import (
 )
 
 func TestSafemem(t *testing.T) {
+	const bufferSize = defaultBufferSize
+
 	testCases := []struct {
 		name    string
 		input   string
diff --git a/pkg/buffer/view.go b/pkg/buffer/view.go
index e6901eadb..00652d675 100644
--- a/pkg/buffer/view.go
+++ b/pkg/buffer/view.go
@@ -27,6 +27,7 @@ import (
 type View struct {
 	data bufferList
 	size int64
+	pool pool
 }
 
 // TrimFront removes the first count bytes from the buffer.
@@ -81,7 +82,7 @@ func (v *View) advanceRead(count int64) {
 		buf = buf.Next() // Iterate.
 		v.data.Remove(oldBuf)
 		oldBuf.Reset()
-		bufferPool.Put(oldBuf)
+		v.pool.put(oldBuf)
 
 		// Update counts.
 		count -= sz
@@ -118,7 +119,7 @@ func (v *View) Truncate(length int64) {
 		// Drop the buffer completely; see above.
 		v.data.Remove(buf)
 		buf.Reset()
-		bufferPool.Put(buf)
+		v.pool.put(buf)
 		v.size -= sz
 	}
 }
@@ -137,7 +138,7 @@ func (v *View) Grow(length int64, zero bool) {
 
 		// Is there some space in the last buffer?
 		if buf == nil || buf.Full() {
-			buf = bufferPool.Get().(*buffer)
+			buf = v.pool.get()
 			v.data.PushBack(buf)
 		}
 
@@ -181,7 +182,7 @@ func (v *View) Prepend(data []byte) {
 
 	for len(data) > 0 {
 		// Do we need an empty buffer?
-		buf := bufferPool.Get().(*buffer)
+		buf := v.pool.get()
 		v.data.PushFront(buf)
 
 		// The buffer is empty; copy last chunk.
@@ -211,7 +212,7 @@ func (v *View) Append(data []byte) {
 
 		// Ensure there's a buffer with space.
 		if buf == nil || buf.Full() {
-			buf = bufferPool.Get().(*buffer)
+			buf = v.pool.get()
 			v.data.PushBack(buf)
 		}
 
@@ -297,7 +298,7 @@ func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) {
 
 		// Ensure we have an empty buffer.
 		if buf == nil || buf.Full() {
-			buf = bufferPool.Get().(*buffer)
+			buf = v.pool.get()
 			v.data.PushBack(buf)
 		}
 
diff --git a/pkg/buffer/view_test.go b/pkg/buffer/view_test.go
index 3db1bc6ee..839af0223 100644
--- a/pkg/buffer/view_test.go
+++ b/pkg/buffer/view_test.go
@@ -16,11 +16,16 @@ package buffer
 
 import (
 	"bytes"
+	"context"
 	"io"
 	"strings"
 	"testing"
+
+	"gvisor.dev/gvisor/pkg/state"
 )
 
+const bufferSize = defaultBufferSize
+
 func fillAppend(v *View, data []byte) {
 	v.Append(data)
 }
@@ -50,6 +55,30 @@ var fillFuncs = map[string]func(*View, []byte){
 	"writeFromReaderEnd": fillWriteFromReaderEnd,
 }
 
+func BenchmarkReadAt(b *testing.B) {
+	b.ReportAllocs()
+	var v View
+	v.Append(make([]byte, 100))
+
+	buf := make([]byte, 10)
+	for i := 0; i < b.N; i++ {
+		v.ReadAt(buf, 0)
+	}
+}
+
+func BenchmarkWriteRead(b *testing.B) {
+	b.ReportAllocs()
+	var v View
+	sz := 1000
+	wbuf := make([]byte, sz)
+	rbuf := bytes.NewBuffer(make([]byte, sz))
+	for i := 0; i < b.N; i++ {
+		v.Append(wbuf)
+		rbuf.Reset()
+		v.ReadToWriter(rbuf, int64(sz))
+	}
+}
+
 func testReadAt(t *testing.T, v *View, offset int64, n int, wantStr string, wantErr error) {
 	t.Helper()
 	d := make([]byte, n)
@@ -465,3 +494,51 @@ func TestView(t *testing.T) {
 		}
 	}
 }
+
+func doSaveAndLoad(t *testing.T, toSave, toLoad *View) {
+	t.Helper()
+	var buf bytes.Buffer
+	ctx := context.Background()
+	if _, err := state.Save(ctx, &buf, toSave); err != nil {
+		t.Fatal("state.Save:", err)
+	}
+	if _, err := state.Load(ctx, bytes.NewReader(buf.Bytes()), toLoad); err != nil {
+		t.Fatal("state.Load:", err)
+	}
+}
+
+func TestSaveRestoreViewEmpty(t *testing.T) {
+	var toSave View
+	var v View
+	doSaveAndLoad(t, &toSave, &v)
+
+	if got := v.pool.avail; got != nil {
+		t.Errorf("pool is not in zero state: v.pool.avail = %v, want nil", got)
+	}
+	if got := v.Flatten(); len(got) != 0 {
+		t.Errorf("v.Flatten() = %x, want []", got)
+	}
+}
+
+func TestSaveRestoreView(t *testing.T) {
+	// Create data that fits 2.5 slots.
+	data := bytes.Join([][]byte{
+		bytes.Repeat([]byte{1, 2}, defaultBufferSize),
+		bytes.Repeat([]byte{3}, defaultBufferSize/2),
+	}, nil)
+
+	var toSave View
+	toSave.Append(data)
+
+	var v View
+	doSaveAndLoad(t, &toSave, &v)
+
+	// Next available slot at index 3; 0-2 slot are used.
+	i := 3
+	if got, want := &v.pool.avail[0], &v.pool.embeddedStorage[i]; got != want {
+		t.Errorf("next available buffer points to %p, want %p (&v.pool.embeddedStorage[%d])", got, want, i)
+	}
+	if got := v.Flatten(); !bytes.Equal(got, data) {
+		t.Errorf("v.Flatten() = %x, want %x", got, data)
+	}
+}
diff --git a/pkg/context/BUILD b/pkg/context/BUILD
index 239f31149..f33e23bf7 100644
--- a/pkg/context/BUILD
+++ b/pkg/context/BUILD
@@ -7,7 +7,6 @@ go_library(
     srcs = ["context.go"],
     visibility = ["//:sandbox"],
     deps = [
-        "//pkg/amutex",
         "//pkg/log",
     ],
 )
diff --git a/pkg/context/context.go b/pkg/context/context.go
index 5319b6d8d..f3031fc60 100644
--- a/pkg/context/context.go
+++ b/pkg/context/context.go
@@ -26,7 +26,6 @@ import (
 	"context"
 	"time"
 
-	"gvisor.dev/gvisor/pkg/amutex"
 	"gvisor.dev/gvisor/pkg/log"
 )
 
@@ -68,9 +67,10 @@ func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) {
 // In both cases, values extracted from the Context should be used instead.
 type Context interface {
 	log.Logger
-	amutex.Sleeper
 	context.Context
 
+	ChannelSleeper
+
 	// UninterruptibleSleepStart indicates the beginning of an uninterruptible
 	// sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). If deactivate
 	// is true and the Context represents a Task, the Task's AddressSpace is
@@ -85,29 +85,60 @@ type Context interface {
 	UninterruptibleSleepFinish(activate bool)
 }
 
-// NoopSleeper is a noop implementation of amutex.Sleeper and UninterruptibleSleep
-// methods for anonymous embedding in other types that do not implement sleeps.
-type NoopSleeper struct {
-	amutex.NoopSleeper
+// A ChannelSleeper represents a goroutine that may sleep interruptibly, where
+// interruption is indicated by a channel becoming readable.
+type ChannelSleeper interface {
+	// SleepStart is called before going to sleep interruptibly. If SleepStart
+	// returns a non-nil channel and that channel becomes ready for receiving
+	// while the goroutine is sleeping, the goroutine should be woken, and
+	// SleepFinish(false) should be called. Otherwise, SleepFinish(true) should
+	// be called after the goroutine stops sleeping.
+	SleepStart() <-chan struct{}
+
+	// SleepFinish is called after an interruptibly-sleeping goroutine stops
+	// sleeping, as documented by SleepStart.
+	SleepFinish(success bool)
+
+	// Interrupted returns true if the channel returned by SleepStart is
+	// ready for receiving.
+	Interrupted() bool
 }
 
-// UninterruptibleSleepStart does nothing.
-func (NoopSleeper) UninterruptibleSleepStart(bool) {}
+// NoopSleeper is a noop implementation of ChannelSleeper and
+// Context.UninterruptibleSleep* methods for anonymous embedding in other types
+// that do not implement special behavior around sleeps.
+type NoopSleeper struct{}
 
-// UninterruptibleSleepFinish does nothing.
-func (NoopSleeper) UninterruptibleSleepFinish(bool) {}
+// SleepStart implements ChannelSleeper.SleepStart.
+func (NoopSleeper) SleepStart() <-chan struct{} {
+	return nil
+}
 
-// Deadline returns zero values, meaning no deadline.
+// SleepFinish implements ChannelSleeper.SleepFinish.
+func (NoopSleeper) SleepFinish(success bool) {}
+
+// Interrupted implements ChannelSleeper.Interrupted.
+func (NoopSleeper) Interrupted() bool {
+	return false
+}
+
+// UninterruptibleSleepStart implements Context.UninterruptibleSleepStart.
+func (NoopSleeper) UninterruptibleSleepStart(deactivate bool) {}
+
+// UninterruptibleSleepFinish implements Context.UninterruptibleSleepFinish.
+func (NoopSleeper) UninterruptibleSleepFinish(activate bool) {}
+
+// Deadline implements context.Context.Deadline.
 func (NoopSleeper) Deadline() (time.Time, bool) {
 	return time.Time{}, false
 }
 
-// Done returns nil.
+// Done implements context.Context.Done.
 func (NoopSleeper) Done() <-chan struct{} {
 	return nil
 }
 
-// Err returns nil.
+// Err returns context.Context.Err.
 func (NoopSleeper) Err() error {
 	return nil
 }
@@ -135,3 +166,27 @@ var bgContext = &logContext{Logger: log.Log()}
 func Background() Context {
 	return bgContext
 }
+
+// WithValue returns a copy of parent in which the value associated with key is
+// val.
+func WithValue(parent Context, key, val interface{}) Context {
+	return &withValue{
+		Context: parent,
+		key:     key,
+		val:     val,
+	}
+}
+
+type withValue struct {
+	Context
+	key interface{}
+	val interface{}
+}
+
+// Value implements Context.Value.
+func (ctx *withValue) Value(key interface{}) interface{} {
+	if key == ctx.key {
+		return ctx.val
+	}
+	return ctx.Context.Value(key)
+}
diff --git a/pkg/coverage/BUILD b/pkg/coverage/BUILD
new file mode 100644
index 000000000..a198e8028
--- /dev/null
+++ b/pkg/coverage/BUILD
@@ -0,0 +1,14 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "coverage",
+    srcs = ["coverage.go"],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/sync",
+        "//pkg/usermem",
+        "@io_bazel_rules_go//go/tools/coverdata",
+    ],
+)
diff --git a/pkg/coverage/coverage.go b/pkg/coverage/coverage.go
new file mode 100644
index 000000000..a4f4b2c5e
--- /dev/null
+++ b/pkg/coverage/coverage.go
@@ -0,0 +1,172 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package coverage provides an interface through which Go coverage data can
+// be collected, converted to kcov format, and exposed to userspace.
+//
+// Coverage can be enabled by calling bazel {build,test} with
+// --collect_coverage_data and --instrumentation_filter with the desired
+// coverage surface. This causes bazel to use the Go cover tool manually to
+// generate instrumented files. It injects a hook that registers all coverage
+// data with the coverdata package.
+package coverage
+
+import (
+	"fmt"
+	"io"
+	"sort"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
+
+	"github.com/bazelbuild/rules_go/go/tools/coverdata"
+)
+
+// KcovAvailable returns whether the kcov coverage interface is available. It is
+// available as long as coverage is enabled for some files.
+func KcovAvailable() bool {
+	return len(coverdata.Cover.Blocks) > 0
+}
+
+// coverageMu must be held while accessing coverdata.Cover. This prevents
+// concurrent reads/writes from multiple threads collecting coverage data.
+var coverageMu sync.RWMutex
+
+// once ensures that globalData is only initialized once.
+var once sync.Once
+
+var globalData struct {
+	// files is the set of covered files sorted by filename. It is calculated at
+	// startup.
+	files []string
+
+	// syntheticPCs are a set of PCs calculated at startup, where the PC
+	// at syntheticPCs[i][j] corresponds to file i, block j.
+	syntheticPCs [][]uint64
+}
+
+// ClearCoverageData clears existing coverage data.
+func ClearCoverageData() {
+	coverageMu.Lock()
+	defer coverageMu.Unlock()
+	for _, counters := range coverdata.Cover.Counters {
+		for index := 0; index < len(counters); index++ {
+			atomic.StoreUint32(&counters[index], 0)
+		}
+	}
+}
+
+var coveragePool = sync.Pool{
+	New: func() interface{} {
+		return make([]byte, 0)
+	},
+}
+
+// ConsumeCoverageData builds and writes the collection of covered PCs. It
+// returns the number of bytes written.
+//
+// In Linux, a kernel configuration is set that compiles the kernel with a
+// custom function that is called at the beginning of every basic block, which
+// updates the memory-mapped coverage information. The Go coverage tool does not
+// allow us to inject arbitrary instructions into basic blocks, but it does
+// provide data that we can convert to a kcov-like format and transfer them to
+// userspace through a memory mapping.
+//
+// Note that this is not a strict implementation of kcov, which is especially
+// tricky to do because we do not have the same coverage tools available in Go
+// that that are available for the actual Linux kernel. In Linux, a kernel
+// configuration is set that compiles the kernel with a custom function that is
+// called at the beginning of every basic block to write program counters to the
+// kcov memory mapping. In Go, however, coverage tools only give us a count of
+// basic blocks as they are executed. Every time we return to userspace, we
+// collect the coverage information and write out PCs for each block that was
+// executed, providing userspace with the illusion that the kcov data is always
+// up to date. For convenience, we also generate a unique synthetic PC for each
+// block instead of using actual PCs. Finally, we do not provide thread-specific
+// coverage data (each kcov instance only contains PCs executed by the thread
+// owning it); instead, we will supply data for any file specified by --
+// instrumentation_filter.
+//
+// Note that we "consume", i.e. clear, coverdata when this function is run, to
+// ensure that each event is only reported once. Due to the limitations of Go
+// coverage tools, we reset the global coverage data every time this function is
+// run.
+func ConsumeCoverageData(w io.Writer) int {
+	once.Do(initCoverageData)
+
+	coverageMu.Lock()
+	defer coverageMu.Unlock()
+
+	total := 0
+	var pcBuffer [8]byte
+	for fileIndex, file := range globalData.files {
+		counters := coverdata.Cover.Counters[file]
+		for index := 0; index < len(counters); index++ {
+			if atomic.LoadUint32(&counters[index]) == 0 {
+				continue
+			}
+			// Non-zero coverage data found; consume it and report as a PC.
+			atomic.StoreUint32(&counters[index], 0)
+			pc := globalData.syntheticPCs[fileIndex][index]
+			usermem.ByteOrder.PutUint64(pcBuffer[:], pc)
+			n, err := w.Write(pcBuffer[:])
+			if err != nil {
+				if err == io.EOF {
+					// Simply stop writing if we encounter EOF; it's ok if we attempted to
+					// write more than we can hold.
+					return total + n
+				}
+				panic(fmt.Sprintf("Internal error writing PCs to kcov area: %v", err))
+			}
+			total += n
+		}
+	}
+
+	if total == 0 {
+		// An empty profile indicates that coverage is not enabled, in which case
+		// there shouldn't be any task work registered.
+		panic("kcov task work is registered, but no coverage data was found")
+	}
+	return total
+}
+
+// initCoverageData initializes globalData. It should only be called once,
+// before any kcov data is written.
+func initCoverageData() {
+	// First, order all files. Then calculate synthetic PCs for every block
+	// (using the well-defined ordering for files as well).
+	for file := range coverdata.Cover.Blocks {
+		globalData.files = append(globalData.files, file)
+	}
+	sort.Strings(globalData.files)
+
+	// nextSyntheticPC is the first PC that we generate for a block.
+	//
+	// This uses a standard-looking kernel range for simplicity.
+	//
+	// FIXME(b/160639712): This is only necessary because syzkaller requires
+	// addresses in the kernel range. If we can remove this constraint, then we
+	// should be able to use the actual addresses.
+	var nextSyntheticPC uint64 = 0xffffffff80000000
+	for _, file := range globalData.files {
+		blocks := coverdata.Cover.Blocks[file]
+		thisFile := make([]uint64, 0, len(blocks))
+		for range blocks {
+			thisFile = append(thisFile, nextSyntheticPC)
+			nextSyntheticPC++ // Advance.
+		}
+		globalData.syntheticPCs = append(globalData.syntheticPCs, thisFile)
+	}
+}
diff --git a/pkg/cpuid/cpuid_parse_x86_test.go b/pkg/cpuid/cpuid_parse_x86_test.go
index c9bd40e1b..e4ae0d689 100644
--- a/pkg/cpuid/cpuid_parse_x86_test.go
+++ b/pkg/cpuid/cpuid_parse_x86_test.go
@@ -32,27 +32,27 @@ func kernelVersion() (int, int, error) {
 		return 0, 0, err
 	}
 
-	var r string
+	var sb strings.Builder
 	for _, b := range u.Release {
 		if b == 0 {
 			break
 		}
-		r += string(b)
+		sb.WriteByte(byte(b))
 	}
 
-	s := strings.Split(r, ".")
+	s := strings.Split(sb.String(), ".")
 	if len(s) < 2 {
-		return 0, 0, fmt.Errorf("kernel release missing major and minor component: %s", r)
+		return 0, 0, fmt.Errorf("kernel release missing major and minor component: %s", sb.String())
 	}
 
 	major, err := strconv.Atoi(s[0])
 	if err != nil {
-		return 0, 0, fmt.Errorf("error parsing major version %q in %q: %v", s[0], r, err)
+		return 0, 0, fmt.Errorf("error parsing major version %q in %q: %w", s[0], sb.String(), err)
 	}
 
 	minor, err := strconv.Atoi(s[1])
 	if err != nil {
-		return 0, 0, fmt.Errorf("error parsing minor version %q in %q: %v", s[1], r, err)
+		return 0, 0, fmt.Errorf("error parsing minor version %q in %q: %w", s[1], sb.String(), err)
 	}
 
 	return major, minor, nil
diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD
index bee28b68d..a493e3407 100644
--- a/pkg/eventchannel/BUILD
+++ b/pkg/eventchannel/BUILD
@@ -6,6 +6,7 @@ go_library(
     name = "eventchannel",
     srcs = [
         "event.go",
+        "event_any.go",
         "rate.go",
     ],
     visibility = ["//:sandbox"],
@@ -14,8 +15,9 @@ go_library(
         "//pkg/log",
         "//pkg/sync",
         "//pkg/unet",
-        "@com_github_golang_protobuf//proto:go_default_library",
-        "@com_github_golang_protobuf//ptypes:go_default_library_gen",
+        "@org_golang_google_protobuf//encoding/prototext:go_default_library",
+        "@org_golang_google_protobuf//proto:go_default_library",
+        "@org_golang_google_protobuf//types/known/anypb:go_default_library",
         "@org_golang_x_time//rate:go_default_library",
     ],
 )
@@ -32,6 +34,6 @@ go_test(
     library = ":eventchannel",
     deps = [
         "//pkg/sync",
-        "@com_github_golang_protobuf//proto:go_default_library",
+        "@org_golang_google_protobuf//proto:go_default_library",
     ],
 )
diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go
index 9a29c58bd..7172ce75d 100644
--- a/pkg/eventchannel/event.go
+++ b/pkg/eventchannel/event.go
@@ -24,8 +24,8 @@ import (
 	"fmt"
 	"syscall"
 
-	"github.com/golang/protobuf/proto"
-	"github.com/golang/protobuf/ptypes"
+	"google.golang.org/protobuf/encoding/prototext"
+	"google.golang.org/protobuf/proto"
 	pb "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -118,22 +118,6 @@ func (me *multiEmitter) Close() error {
 	return err
 }
 
-func marshal(msg proto.Message) ([]byte, error) {
-	anypb, err := ptypes.MarshalAny(msg)
-	if err != nil {
-		return nil, err
-	}
-
-	// Wire format is uvarint message length followed by binary proto.
-	bufMsg, err := proto.Marshal(anypb)
-	if err != nil {
-		return nil, err
-	}
-	p := make([]byte, binary.MaxVarintLen64)
-	n := binary.PutUvarint(p, uint64(len(bufMsg)))
-	return append(p[:n], bufMsg...), nil
-}
-
 // socketEmitter emits proto messages on a socket.
 type socketEmitter struct {
 	socket *unet.Socket
@@ -155,10 +139,19 @@ func SocketEmitter(fd int) (Emitter, error) {
 
 // Emit implements Emitter.Emit.
 func (s *socketEmitter) Emit(msg proto.Message) (bool, error) {
-	p, err := marshal(msg)
+	any, err := newAny(msg)
 	if err != nil {
 		return false, err
 	}
+	bufMsg, err := proto.Marshal(any)
+	if err != nil {
+		return false, err
+	}
+
+	// Wire format is uvarint message length followed by binary proto.
+	p := make([]byte, binary.MaxVarintLen64)
+	n := binary.PutUvarint(p, uint64(len(bufMsg)))
+	p = append(p[:n], bufMsg...)
 	for done := 0; done < len(p); {
 		n, err := s.socket.Write(p[done:])
 		if err != nil {
@@ -166,6 +159,7 @@ func (s *socketEmitter) Emit(msg proto.Message) (bool, error) {
 		}
 		done += n
 	}
+
 	return false, nil
 }
 
@@ -189,9 +183,13 @@ func DebugEmitterFrom(inner Emitter) Emitter {
 }
 
 func (d *debugEmitter) Emit(msg proto.Message) (bool, error) {
+	text, err := prototext.Marshal(msg)
+	if err != nil {
+		return false, err
+	}
 	ev := &pb.DebugEvent{
-		Name: proto.MessageName(msg),
-		Text: proto.MarshalTextString(msg),
+		Name: string(msg.ProtoReflect().Descriptor().FullName()),
+		Text: string(text),
 	}
 	return d.inner.Emit(ev)
 }
diff --git a/pkg/eventchannel/event.proto b/pkg/eventchannel/event.proto
index 34468f072..4b24ac47c 100644
--- a/pkg/eventchannel/event.proto
+++ b/pkg/eventchannel/event.proto
@@ -16,7 +16,7 @@ syntax = "proto3";
 
 package gvisor;
 
-// A debug event encapsulates any other event protobuf in text format. This is
+// DebugEvent encapsulates any other event protobuf in text format. This is
 // useful because clients reading events emitted this way do not need to link
 // the event protobufs to display them in a human-readable format.
 message DebugEvent {
diff --git a/pkg/eventchannel/event_any.go b/pkg/eventchannel/event_any.go
new file mode 100644
index 000000000..a5549f6cd
--- /dev/null
+++ b/pkg/eventchannel/event_any.go
@@ -0,0 +1,25 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package eventchannel
+
+import (
+	"google.golang.org/protobuf/types/known/anypb"
+
+	"google.golang.org/protobuf/proto"
+)
+
+func newAny(m proto.Message) (*anypb.Any, error) {
+	return anypb.New(m)
+}
diff --git a/pkg/eventchannel/event_test.go b/pkg/eventchannel/event_test.go
index 43750360b..0dd408f76 100644
--- a/pkg/eventchannel/event_test.go
+++ b/pkg/eventchannel/event_test.go
@@ -19,7 +19,7 @@ import (
 	"testing"
 	"time"
 
-	"github.com/golang/protobuf/proto"
+	"google.golang.org/protobuf/proto"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
diff --git a/pkg/eventchannel/rate.go b/pkg/eventchannel/rate.go
index 179226c92..74960e16a 100644
--- a/pkg/eventchannel/rate.go
+++ b/pkg/eventchannel/rate.go
@@ -15,8 +15,8 @@
 package eventchannel
 
 import (
-	"github.com/golang/protobuf/proto"
 	"golang.org/x/time/rate"
+	"google.golang.org/protobuf/proto"
 )
 
 // rateLimitedEmitter wraps an emitter and limits events to the given limits.
diff --git a/pkg/fd/fd.go b/pkg/fd/fd.go
index 83bcfe220..cc6b0cdf1 100644
--- a/pkg/fd/fd.go
+++ b/pkg/fd/fd.go
@@ -49,7 +49,7 @@ func fixCount(n int, err error) (int, error) {
 
 // Read implements io.Reader.
 func (r *ReadWriter) Read(b []byte) (int, error) {
-	c, err := fixCount(syscall.Read(int(atomic.LoadInt64(&r.fd)), b))
+	c, err := fixCount(syscall.Read(r.FD(), b))
 	if c == 0 && len(b) > 0 && err == nil {
 		return 0, io.EOF
 	}
@@ -62,7 +62,7 @@ func (r *ReadWriter) Read(b []byte) (int, error) {
 func (r *ReadWriter) ReadAt(b []byte, off int64) (c int, err error) {
 	for len(b) > 0 {
 		var m int
-		m, err = fixCount(syscall.Pread(int(atomic.LoadInt64(&r.fd)), b, off))
+		m, err = fixCount(syscall.Pread(r.FD(), b, off))
 		if m == 0 && err == nil {
 			return c, io.EOF
 		}
@@ -82,7 +82,7 @@ func (r *ReadWriter) Write(b []byte) (int, error) {
 	var n, remaining int
 	for remaining = len(b); remaining > 0; {
 		woff := len(b) - remaining
-		n, err = syscall.Write(int(atomic.LoadInt64(&r.fd)), b[woff:])
+		n, err = syscall.Write(r.FD(), b[woff:])
 
 		if n > 0 {
 			// syscall.Write wrote some bytes. This is the common case.
@@ -110,7 +110,7 @@ func (r *ReadWriter) Write(b []byte) (int, error) {
 func (r *ReadWriter) WriteAt(b []byte, off int64) (c int, err error) {
 	for len(b) > 0 {
 		var m int
-		m, err = fixCount(syscall.Pwrite(int(atomic.LoadInt64(&r.fd)), b, off))
+		m, err = fixCount(syscall.Pwrite(r.FD(), b, off))
 		if err != nil {
 			break
 		}
@@ -121,6 +121,16 @@ func (r *ReadWriter) WriteAt(b []byte, off int64) (c int, err error) {
 	return
 }
 
+// FD returns the owned file descriptor. Ownership remains unchanged.
+func (r *ReadWriter) FD() int {
+	return int(atomic.LoadInt64(&r.fd))
+}
+
+// String implements Stringer.String().
+func (r *ReadWriter) String() string {
+	return fmt.Sprintf("FD: %d", r.FD())
+}
+
 // FD owns a host file descriptor.
 //
 // It is similar to os.File, with a few important distinctions:
@@ -167,6 +177,23 @@ func NewFromFile(file *os.File) (*FD, error) {
 	return New(fd), nil
 }
 
+// NewFromFiles creates new FDs for each file in the slice.
+func NewFromFiles(files []*os.File) ([]*FD, error) {
+	rv := make([]*FD, 0, len(files))
+	for _, f := range files {
+		new, err := NewFromFile(f)
+		if err != nil {
+			// Cleanup on error.
+			for _, fd := range rv {
+				fd.Close()
+			}
+			return nil, err
+		}
+		rv = append(rv, new)
+	}
+	return rv, nil
+}
+
 // Open is equivalent to open(2).
 func Open(path string, openmode int, perm uint32) (*FD, error) {
 	f, err := syscall.Open(path, openmode|syscall.O_LARGEFILE, perm)
@@ -204,11 +231,6 @@ func (f *FD) Release() int {
 	return int(atomic.SwapInt64(&f.fd, -1))
 }
 
-// FD returns the file descriptor owned by FD. FD retains ownership.
-func (f *FD) FD() int {
-	return int(atomic.LoadInt64(&f.fd))
-}
-
 // File converts the FD to an os.File.
 //
 // FD does not transfer ownership of the file descriptor (it will be
@@ -219,7 +241,7 @@ func (f *FD) FD() int {
 // This operation is somewhat expensive, so care should be taken to minimize
 // its use.
 func (f *FD) File() (*os.File, error) {
-	fd, err := syscall.Dup(int(atomic.LoadInt64(&f.fd)))
+	fd, err := syscall.Dup(f.FD())
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/fdnotifier/poll_unsafe.go b/pkg/fdnotifier/poll_unsafe.go
index 4225b04dd..ec2f997a2 100644
--- a/pkg/fdnotifier/poll_unsafe.go
+++ b/pkg/fdnotifier/poll_unsafe.go
@@ -65,8 +65,7 @@ func NonBlockingPoll(fd int32, mask waiter.EventMask) waiter.EventMask {
 
 // epollWait performs a blocking wait on epfd.
 //
-// Preconditions:
-//  * len(events) > 0
+// Preconditions: len(events) > 0
 func epollWait(epfd int, events []syscall.EpollEvent, msec int) (int, error) {
 	if len(events) == 0 {
 		panic("Empty events passed to EpollWait")
diff --git a/pkg/flipcall/flipcall.go b/pkg/flipcall/flipcall.go
index ec742c091..c4a3366ce 100644
--- a/pkg/flipcall/flipcall.go
+++ b/pkg/flipcall/flipcall.go
@@ -179,8 +179,10 @@ const (
 
 // Connect blocks until the peer Endpoint has called Endpoint.RecvFirst().
 //
-// Preconditions: ep is a client Endpoint. ep.Connect(), ep.RecvFirst(),
-// ep.SendRecv(), and ep.SendLast() have never been called.
+// Preconditions:
+// * ep is a client Endpoint.
+// * ep.Connect(), ep.RecvFirst(), ep.SendRecv(), and ep.SendLast() have never
+//   been called.
 func (ep *Endpoint) Connect() error {
 	err := ep.ctrlConnect()
 	if err == nil {
@@ -192,8 +194,9 @@ func (ep *Endpoint) Connect() error {
 // RecvFirst blocks until the peer Endpoint calls Endpoint.SendRecv(), then
 // returns the datagram length specified by that call.
 //
-// Preconditions: ep is a server Endpoint. ep.SendRecv(), ep.RecvFirst(), and
-// ep.SendLast() have never been called.
+// Preconditions:
+// * ep is a server Endpoint.
+// * ep.SendRecv(), ep.RecvFirst(), and ep.SendLast() have never been called.
 func (ep *Endpoint) RecvFirst() (uint32, error) {
 	if err := ep.ctrlWaitFirst(); err != nil {
 		return 0, err
@@ -211,10 +214,12 @@ func (ep *Endpoint) RecvFirst() (uint32, error) {
 // datagram length, then blocks until the peer Endpoint calls
 // Endpoint.SendRecv() or Endpoint.SendLast().
 //
-// Preconditions: dataLen <= ep.DataCap(). No previous call to ep.SendRecv() or
-// ep.RecvFirst() has returned an error. ep.SendLast() has never been called.
-// If ep is a client Endpoint, ep.Connect() has previously been called and
-// returned nil.
+// Preconditions:
+// * dataLen <= ep.DataCap().
+// * No previous call to ep.SendRecv() or ep.RecvFirst() has returned an error.
+// * ep.SendLast() has never been called.
+// * If ep is a client Endpoint, ep.Connect() has previously been called and
+//   returned nil.
 func (ep *Endpoint) SendRecv(dataLen uint32) (uint32, error) {
 	if dataLen > ep.dataCap {
 		panic(fmt.Sprintf("attempting to send packet with datagram length %d (maximum %d)", dataLen, ep.dataCap))
@@ -240,10 +245,12 @@ func (ep *Endpoint) SendRecv(dataLen uint32) (uint32, error) {
 // SendLast causes the peer Endpoint's call to Endpoint.SendRecv() or
 // Endpoint.RecvFirst() to return with the given datagram length.
 //
-// Preconditions: dataLen <= ep.DataCap(). No previous call to ep.SendRecv() or
-// ep.RecvFirst() has returned an error. ep.SendLast() has never been called.
-// If ep is a client Endpoint, ep.Connect() has previously been called and
-// returned nil.
+// Preconditions:
+// * dataLen <= ep.DataCap().
+// * No previous call to ep.SendRecv() or ep.RecvFirst() has returned an error.
+// * ep.SendLast() has never been called.
+// * If ep is a client Endpoint, ep.Connect() has previously been called and
+//   returned nil.
 func (ep *Endpoint) SendLast(dataLen uint32) error {
 	if dataLen > ep.dataCap {
 		panic(fmt.Sprintf("attempting to send packet with datagram length %d (maximum %d)", dataLen, ep.dataCap))
diff --git a/pkg/lisafs/README.md b/pkg/lisafs/README.md
new file mode 100644
index 000000000..51d0d40e5
--- /dev/null
+++ b/pkg/lisafs/README.md
@@ -0,0 +1,363 @@
+# Replacing 9P
+
+## Background
+
+The Linux filesystem model consists of the following key aspects (modulo mounts,
+which are outside the scope of this discussion):
+
+-   A `struct inode` represents a "filesystem object", such as a directory or a
+    regular file. "Filesystem object" is most precisely defined by the practical
+    properties of an inode, such as an immutable type (regular file, directory,
+    symbolic link, etc.) and its independence from the path originally used to
+    obtain it.
+
+-   A `struct dentry` represents a node in a filesystem tree. Semantically, each
+    dentry is immutably associated with an inode representing the filesystem
+    object at that position. (Linux implements optimizations involving reuse of
+    unreferenced dentries, which allows their associated inodes to change, but
+    this is outside the scope of this discussion.)
+
+-   A `struct file` represents an open file description (hereafter FD) and is
+    needed to perform I/O. Each FD is immutably associated with the dentry
+    through which it was opened.
+
+The current gVisor virtual filesystem implementation (hereafter VFS1) closely
+imitates the Linux design:
+
+-   `struct inode` => `fs.Inode`
+
+-   `struct dentry` => `fs.Dirent`
+
+-   `struct file` => `fs.File`
+
+gVisor accesses most external filesystems through a variant of the 9P2000.L
+protocol, including extensions for performance (`walkgetattr`) and for features
+not supported by vanilla 9P2000.L (`flushf`, `lconnect`). The 9P protocol family
+is inode-based; 9P fids represent a file (equivalently "file system object"),
+and the protocol is structured around alternatively obtaining fids to represent
+files (with `walk` and, in gVisor, `walkgetattr`) and performing operations on
+those fids.
+
+In the sections below, a **shared** filesystem is a filesystem that is *mutably*
+accessible by multiple concurrent clients, such that a **non-shared** filesystem
+is a filesystem that is either read-only or accessible by only a single client.
+
+## Problems
+
+### Serialization of Path Component RPCs
+
+Broadly speaking, VFS1 traverses each path component in a pathname, alternating
+between verifying that each traversed dentry represents an inode that represents
+a searchable directory and moving to the next dentry in the path.
+
+In the context of a remote filesystem, the structure of this traversal means
+that - modulo caching - a path involving N components requires at least N-1
+*sequential* RPCs to obtain metadata for intermediate directories, incurring
+significant latency. (In vanilla 9P2000.L, 2(N-1) RPCs are required: N-1 `walk`
+and N-1 `getattr`. We added the `walkgetattr` RPC to reduce this overhead.) On
+non-shared filesystems, this overhead is primarily significant during
+application startup; caching mitigates much of this overhead at steady state. On
+shared filesystems, where correct caching requires revalidation (requiring RPCs
+for each revalidated directory anyway), this overhead is consistently ruinous.
+
+### Inefficient RPCs
+
+9P is not exceptionally economical with RPCs in general. In addition to the
+issue described above:
+
+-   Opening an existing file in 9P involves at least 2 RPCs: `walk` to produce
+    an unopened fid representing the file, and `lopen` to open the fid.
+
+-   Creating a file also involves at least 2 RPCs: `walk` to produce an unopened
+    fid representing the parent directory, and `lcreate` to create the file and
+    convert the fid to an open fid representing the created file. In practice,
+    both the Linux and gVisor 9P clients expect to have an unopened fid for the
+    created file (necessitating an additional `walk`), as well as attributes for
+    the created file (necessitating an additional `getattr`), for a total of 4
+    RPCs. (In a shared filesystem, where whether a file already exists can
+    change between RPCs, a correct implementation of `open(O_CREAT)` would have
+    to alternate between these two paths (plus `clunk`ing the temporary fid
+    between alternations, since the nature of the `fid` differs between the two
+    paths). Neither Linux nor gVisor implement the required alternation, so
+    `open(O_CREAT)` without `O_EXCL` can spuriously fail with `EEXIST` on both.)
+
+-   Closing (`clunk`ing) a fid requires an RPC. VFS1 issues this RPC
+    asynchronously in an attempt to reduce critical path latency, but scheduling
+    overhead makes this not clearly advantageous in practice.
+
+-   `read` and `readdir` can return partial reads without a way to indicate EOF,
+    necessitating an additional final read to detect EOF.
+
+-   Operations that affect filesystem state do not consistently return updated
+    filesystem state. In gVisor, the client implementation attempts to handle
+    this by tracking what it thinks updated state "should" be; this is complex,
+    and especially brittle for timestamps (which are often not arbitrarily
+    settable). In Linux, the client implemtation invalidates cached metadata
+    whenever it performs such an operation, and reloads it when a dentry
+    corresponding to an inode with no valid cached metadata is revalidated; this
+    is simple, but necessitates an additional `getattr`.
+
+### Dentry/Inode Ambiguity
+
+As noted above, 9P's documentation tends to imply that unopened fids represent
+an inode. In practice, most filesystem APIs present very limited interfaces for
+working with inodes at best, such that the interpretation of unopened fids
+varies:
+
+-   Linux's 9P client associates unopened fids with (dentry, uid) pairs. When
+    caching is enabled, it also associates each inode with the first fid opened
+    writably that references that inode, in order to support page cache
+    writeback.
+
+-   gVisor's 9P client associates unopened fids with inodes, and also caches
+    opened fids in inodes in a manner similar to Linux.
+
+-   The runsc fsgofer associates unopened fids with both "dentries" (host
+    filesystem paths) and "inodes" (host file descriptors); which is used
+    depends on the operation invoked on the fid.
+
+For non-shared filesystems, this confusion has resulted in correctness issues
+that are (in gVisor) currently handled by a number of coarse-grained locks that
+serialize renames with all other filesystem operations. For shared filesystems,
+this means inconsistent behavior in the presence of concurrent mutation.
+
+## Design
+
+Almost all Linux filesystem syscalls describe filesystem resources in one of two
+ways:
+
+-   Path-based: A filesystem position is described by a combination of a
+    starting position and a sequence of path components relative to that
+    position, where the starting position is one of:
+
+    -   The VFS root (defined by mount namespace and chroot), for absolute paths
+
+    -   The VFS position of an existing FD, for relative paths passed to `*at`
+        syscalls (e.g. `statat`)
+
+    -   The current working directory, for relative paths passed to non-`*at`
+        syscalls and `*at` syscalls with `AT_FDCWD`
+
+-   File-description-based: A filesystem object is described by an existing FD,
+    passed to a `f*` syscall (e.g. `fstat`).
+
+Many of our issues with 9P arise from its (and VFS') interposition of a model
+based on inodes between the filesystem syscall API and filesystem
+implementations. We propose to replace 9P with a protocol that does not feature
+inodes at all, and instead closely follows the filesystem syscall API by
+featuring only path-based and FD-based operations, with minimal deviations as
+necessary to ameliorate deficiencies in the syscall interface (see below). This
+approach addresses the issues described above:
+
+-   Even on shared filesystems, most application filesystem syscalls are
+    translated to a single RPC (possibly excepting special cases described
+    below), which is a logical lower bound.
+
+-   The behavior of application syscalls on shared filesystems is
+    straightforwardly predictable: path-based syscalls are translated to
+    path-based RPCs, which will re-lookup the file at that path, and FD-based
+    syscalls are translated to FD-based RPCs, which use an existing open file
+    without performing another lookup. (This is at least true on gofers that
+    proxy the host local filesystem; other filesystems that lack support for
+    e.g. certain operations on FDs may have different behavior, but this
+    divergence is at least still predictable and inherent to the underlying
+    filesystem implementation.)
+
+Note that this approach is only feasible in gVisor's next-generation virtual
+filesystem (VFS2), which does not assume the existence of inodes and allows the
+remote filesystem client to translate whole path-based syscalls into RPCs. Thus
+one of the unavoidable tradeoffs associated with such a protocol vs. 9P is the
+inability to construct a Linux client that is performance-competitive with
+gVisor.
+
+### File Permissions
+
+Many filesystem operations are side-effectual, such that file permissions must
+be checked before such operations take effect. The simplest approach to file
+permission checking is for the sentry to obtain permissions from the remote
+filesystem, then apply permission checks in the sentry before performing the
+application-requested operation. However, this requires an additional RPC per
+application syscall (which can't be mitigated by caching on shared filesystems).
+Alternatively, we may delegate file permission checking to gofers. In general,
+file permission checks depend on the following properties of the accessor:
+
+-   Filesystem UID/GID
+
+-   Supplementary GIDs
+
+-   Effective capabilities in the accessor's user namespace (i.e. the accessor's
+    effective capability set)
+
+-   All UIDs and GIDs mapped in the accessor's user namespace (which determine
+    if the accessor's capabilities apply to accessed files)
+
+We may choose to delay implementation of file permission checking delegation,
+although this is potentially costly since it doubles the number of required RPCs
+for most operations on shared filesystems. We may also consider compromise
+options, such as only delegating file permission checks for accessors in the
+root user namespace.
+
+### Symbolic Links
+
+gVisor usually interprets symbolic link targets in its VFS rather than on the
+filesystem containing the symbolic link; thus e.g. a symlink to
+"/proc/self/maps" on a remote filesystem resolves to said file in the sentry's
+procfs rather than the host's. This implies that:
+
+-   Remote filesystem servers that proxy filesystems supporting symlinks must
+    check if each path component is a symlink during path traversal.
+
+-   Absolute symlinks require that the sentry restart the operation at its
+    contextual VFS root (which is task-specific and may not be on a remote
+    filesystem at all), so if a remote filesystem server encounters an absolute
+    symlink during path traversal on behalf of a path-based operation, it must
+    terminate path traversal and return the symlink target.
+
+-   Relative symlinks begin target resolution in the parent directory of the
+    symlink, so in theory most relative symlinks can be handled automatically
+    during the path traversal that encounters the symlink, provided that said
+    traversal is supplied with the number of remaining symlinks before `ELOOP`.
+    However, the new path traversed by the symlink target may cross VFS mount
+    boundaries, such that it's only safe for remote filesystem servers to
+    speculatively follow relative symlinks for side-effect-free operations such
+    as `stat` (where the sentry can simply ignore results that are inapplicable
+    due to crossing mount boundaries). We may choose to delay implementation of
+    this feature, at the cost of an additional RPC per relative symlink (note
+    that even if the symlink target crosses a mount boundary, the sentry will
+    need to `stat` the path to the mount boundary to confirm that each traversed
+    component is an accessible directory); until it is implemented, relative
+    symlinks may be handled like absolute symlinks, by terminating path
+    traversal and returning the symlink target.
+
+The possibility of symlinks (and the possibility of a compromised sentry) means
+that the sentry may issue RPCs with paths that, in the absence of symlinks,
+would traverse beyond the root of the remote filesystem. For example, the sentry
+may issue an RPC with a path like "/foo/../..", on the premise that if "/foo" is
+a symlink then the resulting path may be elsewhere on the remote filesystem. To
+handle this, path traversal must also track its current depth below the remote
+filesystem root, and terminate path traversal if it would ascend beyond this
+point.
+
+### Path Traversal
+
+Since path-based VFS operations will translate to path-based RPCs, filesystem
+servers will need to handle path traversal. From the perspective of a given
+filesystem implementation in the server, there are two basic approaches to path
+traversal:
+
+-   Inode-walk: For each path component, obtain a handle to the underlying
+    filesystem object (e.g. with `open(O_PATH)`), check if that object is a
+    symlink (as described above) and that that object is accessible by the
+    caller (e.g. with `fstat()`), then continue to the next path component (e.g.
+    with `openat()`). This ensures that the checked filesystem object is the one
+    used to obtain the next object in the traversal, which is intuitively
+    appealing. However, while this approach works for host local filesystems, it
+    requires features that are not widely supported by other filesystems.
+
+-   Path-walk: For each path component, use a path-based operation to determine
+    if the filesystem object currently referred to by that path component is a
+    symlink / is accessible. This is highly portable, but suffers from quadratic
+    behavior (at the level of the underlying filesystem implementation, the
+    first path component will be traversed a number of times equal to the number
+    of path components in the path).
+
+The implementation should support either option by delegating path traversal to
+filesystem implementations within the server (like VFS and the remote filesystem
+protocol itself), as inode-walking is still safe, efficient, amenable to FD
+caching, and implementable on non-shared host local filesystems (a sufficiently
+common case as to be worth considering in the design).
+
+Both approaches are susceptible to race conditions that may permit sandboxed
+filesystem escapes:
+
+-   Under inode-walk, a malicious application may cause a directory to be moved
+    (with `rename`) during path traversal, such that the filesystem
+    implementation incorrectly determines whether subsequent inodes are located
+    in paths that should be visible to sandboxed applications.
+
+-   Under path-walk, a malicious application may cause a non-symlink file to be
+    replaced with a symlink during path traversal, such that following path
+    operations will incorrectly follow the symlink.
+
+Both race conditions can, to some extent, be mitigated in filesystem server
+implementations by synchronizing path traversal with the hazardous operations in
+question. However, shared filesystems are frequently used to share data between
+sandboxed and unsandboxed applications in a controlled way, and in some cases a
+malicious sandboxed application may be able to take advantage of a hazardous
+filesystem operation performed by an unsandboxed application. In some cases,
+filesystem features may be available to ensure safety even in such cases (e.g.
+[the new openat2() syscall](https://man7.org/linux/man-pages/man2/openat2.2.html)),
+but it is not clear how to solve this problem in general. (Note that this issue
+is not specific to our design; rather, it is a fundamental limitation of
+filesystem sandboxing.)
+
+### Filesystem Multiplexing
+
+A given sentry may need to access multiple distinct remote filesystems (e.g.
+different volumes for a given container). In many cases, there is no advantage
+to serving these filesystems from distinct filesystem servers, or accessing them
+through distinct connections (factors such as maximum RPC concurrency should be
+based on available host resources). Therefore, the protocol should support
+multiplexing of distinct filesystem trees within a single session. 9P supports
+this by allowing multiple calls to the `attach` RPC to produce fids representing
+distinct filesystem trees, but this is somewhat clunky; we propose a much
+simpler mechanism wherein each message that conveys a path also conveys a
+numeric filesystem ID that identifies a filesystem tree.
+
+## Alternatives Considered
+
+### Additional Extensions to 9P
+
+There are at least three conceptual aspects to 9P:
+
+-   Wire format: messages with a 4-byte little-endian size prefix, strings with
+    a 2-byte little-endian size prefix, etc. Whether the wire format is worth
+    retaining is unclear; in particular, it's unclear that the 9P wire format
+    has a significant advantage over protobufs, which are substantially easier
+    to extend. Note that the official Go protobuf implementation is widely known
+    to suffer from a significant number of performance deficiencies, so if we
+    choose to switch to protobuf, we may need to use an alternative toolchain
+    such as `gogo/protobuf` (which is also widely used in the Go ecosystem, e.g.
+    by Kubernetes).
+
+-   Filesystem model: fids, qids, etc. Discarding this is one of the motivations
+    for this proposal.
+
+-   RPCs: Twalk, Tlopen, etc. In addition to previously-described
+    inefficiencies, most of these are dependent on the filesystem model and
+    therefore must be discarded.
+
+### FUSE
+
+The FUSE (Filesystem in Userspace) protocol is frequently used to provide
+arbitrary userspace filesystem implementations to a host Linux kernel.
+Unfortunately, FUSE is also inode-based, and therefore doesn't address any of
+the problems we have with 9P.
+
+### virtio-fs
+
+virtio-fs is an ongoing project aimed at improving Linux VM filesystem
+performance when accessing Linux host filesystems (vs. virtio-9p). In brief, it
+is based on:
+
+-   Using a FUSE client in the guest that communicates over virtio with a FUSE
+    server in the host.
+
+-   Using DAX to map the host page cache into the guest.
+
+-   Using a file metadata table in shared memory to avoid VM exits for metadata
+    updates.
+
+None of these improvements seem applicable to gVisor:
+
+-   As explained above, FUSE is still inode-based, so it is still susceptible to
+    most of the problems we have with 9P.
+
+-   Our use of host file descriptors already allows us to leverage the host page
+    cache for file contents.
+
+-   Our need for shared filesystem coherence is usually based on a user
+    requirement that an out-of-sandbox filesystem mutation is guaranteed to be
+    visible by all subsequent observations from within the sandbox, or vice
+    versa; it's not clear that this can be guaranteed without a synchronous
+    signaling mechanism like an RPC.
diff --git a/tools/go_marshal/marshal/BUILD b/pkg/marshal/BUILD
index 4aec98218..aac0161fa 100644
--- a/tools/go_marshal/marshal/BUILD
+++ b/pkg/marshal/BUILD
@@ -11,7 +11,5 @@ go_library(
     visibility = [
         "//:sandbox",
     ],
-    deps = [
-        "//pkg/usermem",
-    ],
+    deps = ["//pkg/usermem"],
 )
diff --git a/tools/go_marshal/marshal/marshal.go b/pkg/marshal/marshal.go
index 85b196f08..d8cb44b40 100644
--- a/tools/go_marshal/marshal/marshal.go
+++ b/pkg/marshal/marshal.go
@@ -26,9 +26,10 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// Task provides a subset of kernel.Task, used in marshalling. We don't import
-// the kernel package directly to avoid circular dependency.
-type Task interface {
+// CopyContext defines the memory operations required to marshal to and from
+// user memory. Typically, kernel.Task is used to provide implementations for
+// these operations.
+type CopyContext interface {
 	// CopyScratchBuffer provides a task goroutine-local scratch buffer. See
 	// kernel.CopyScratchBuffer.
 	CopyScratchBuffer(size int) []byte
@@ -107,7 +108,7 @@ type Marshallable interface {
 	// If the copy-in from the task memory is only partially successful, CopyIn
 	// should still attempt to deserialize as much data as possible. See comment
 	// for UnmarshalBytes.
-	CopyIn(task Task, addr usermem.Addr) (int, error)
+	CopyIn(cc CopyContext, addr usermem.Addr) (int, error)
 
 	// CopyOut serializes a Marshallable type to a task's memory. This may only
 	// be called from a task goroutine. This is more efficient than calling
@@ -118,7 +119,7 @@ type Marshallable interface {
 	// The copy-out to the task memory may be partially successful, in which
 	// case CopyOut returns how much data was serialized. See comment for
 	// MarshalBytes for implications.
-	CopyOut(task Task, addr usermem.Addr) (int, error)
+	CopyOut(cc CopyContext, addr usermem.Addr) (int, error)
 
 	// CopyOutN is like CopyOut, but explicitly requests a partial
 	// copy-out. Note that this may yield unexpected results for non-packed
@@ -126,7 +127,7 @@ type Marshallable interface {
 	// comment on MarshalBytes.
 	//
 	// The limit must be less than or equal to SizeBytes().
-	CopyOutN(task Task, addr usermem.Addr, limit int) (int, error)
+	CopyOutN(cc CopyContext, addr usermem.Addr, limit int) (int, error)
 }
 
 // go-marshal generates additional functions for a type based on additional
@@ -156,10 +157,10 @@ type Marshallable interface {
 // func UnmarshalUnsafeFooSlice(dst []Foo, src []byte) (int, error) { ... }
 //
 // // CopyFooSliceIn copies in a slice of Foo objects from the task's memory.
-// func CopyFooSliceIn(task marshal.Task, addr usermem.Addr, dst []Foo) (int, error) { ... }
+// func CopyFooSliceIn(cc marshal.CopyContext, addr usermem.Addr, dst []Foo) (int, error) { ... }
 //
 // // CopyFooSliceIn copies out a slice of Foo objects to the task's memory.
-// func CopyFooSliceOut(task marshal.Task, addr usermem.Addr, src []Foo) (int, error) { ... }
+// func CopyFooSliceOut(cc marshal.CopyContext, addr usermem.Addr, src []Foo) (int, error) { ... }
 //
 // The name of the functions are of the format "Copy%sIn" and "Copy%sOut", where
 // %s is the first argument to the slice clause. This directive is not supported
@@ -174,10 +175,10 @@ type Marshallable interface {
 // This is only valid on newtypes on primitives, and causes the generated
 // functions to accept slices of the inner type instead:
 //
-// func CopyInt32SliceIn(task marshal.Task, addr usermem.Addr, dst []int32) (int, error) { ... }
+// func CopyInt32SliceIn(cc marshal.CopyContext, addr usermem.Addr, dst []int32) (int, error) { ... }
 //
 // Without "inner", they would instead be:
 //
-// func CopyInt32SliceIn(task marshal.Task, addr usermem.Addr, dst []Int32) (int, error) { ... }
+// func CopyInt32SliceIn(cc marshal.CopyContext, addr usermem.Addr, dst []Int32) (int, error) { ... }
 //
 // This may help avoid a cast depending on how the generated functions are used.
diff --git a/tools/go_marshal/marshal/marshal_impl_util.go b/pkg/marshal/marshal_impl_util.go
index 89c7d3575..ea75e09f2 100644
--- a/tools/go_marshal/marshal/marshal_impl_util.go
+++ b/pkg/marshal/marshal_impl_util.go
@@ -44,7 +44,7 @@ func (StubMarshallable) MarshalBytes(dst []byte) {
 
 // UnmarshalBytes implements Marshallable.UnmarshalBytes.
 func (StubMarshallable) UnmarshalBytes(src []byte) {
-	panic("Please implement your own UnMarshalBytes function")
+	panic("Please implement your own UnmarshalBytes function")
 }
 
 // Packed implements Marshallable.Packed.
@@ -63,16 +63,16 @@ func (StubMarshallable) UnmarshalUnsafe(src []byte) {
 }
 
 // CopyIn implements Marshallable.CopyIn.
-func (StubMarshallable) CopyIn(task Task, addr usermem.Addr) (int, error) {
+func (StubMarshallable) CopyIn(cc CopyContext, addr usermem.Addr) (int, error) {
 	panic("Please implement your own CopyIn function")
 }
 
 // CopyOut implements Marshallable.CopyOut.
-func (StubMarshallable) CopyOut(task Task, addr usermem.Addr) (int, error) {
+func (StubMarshallable) CopyOut(cc CopyContext, addr usermem.Addr) (int, error) {
 	panic("Please implement your own CopyOut function")
 }
 
 // CopyOutN implements Marshallable.CopyOutN.
-func (StubMarshallable) CopyOutN(task Task, addr usermem.Addr, limit int) (int, error) {
+func (StubMarshallable) CopyOutN(cc CopyContext, addr usermem.Addr, limit int) (int, error) {
 	panic("Please implement your own CopyOutN function")
 }
diff --git a/tools/go_marshal/primitive/BUILD b/pkg/marshal/primitive/BUILD
index cc08ba63a..d77a11c79 100644
--- a/tools/go_marshal/primitive/BUILD
+++ b/pkg/marshal/primitive/BUILD
@@ -12,7 +12,8 @@ go_library(
         "//:sandbox",
     ],
     deps = [
+        "//pkg/context",
+        "//pkg/marshal",
         "//pkg/usermem",
-        "//tools/go_marshal/marshal",
     ],
 )
diff --git a/tools/go_marshal/primitive/primitive.go b/pkg/marshal/primitive/primitive.go
index d93edda8b..4b342de6b 100644
--- a/tools/go_marshal/primitive/primitive.go
+++ b/pkg/marshal/primitive/primitive.go
@@ -19,8 +19,9 @@ package primitive
 import (
 	"io"
 
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
 // Int8 is a marshal.Marshallable implementation for int8.
@@ -101,18 +102,18 @@ func (b *ByteSlice) UnmarshalUnsafe(src []byte) {
 }
 
 // CopyIn implements marshal.Marshallable.CopyIn.
-func (b *ByteSlice) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {
-	return task.CopyInBytes(addr, *b)
+func (b *ByteSlice) CopyIn(cc marshal.CopyContext, addr usermem.Addr) (int, error) {
+	return cc.CopyInBytes(addr, *b)
 }
 
 // CopyOut implements marshal.Marshallable.CopyOut.
-func (b *ByteSlice) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {
-	return task.CopyOutBytes(addr, *b)
+func (b *ByteSlice) CopyOut(cc marshal.CopyContext, addr usermem.Addr) (int, error) {
+	return cc.CopyOutBytes(addr, *b)
 }
 
 // CopyOutN implements marshal.Marshallable.CopyOutN.
-func (b *ByteSlice) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {
-	return task.CopyOutBytes(addr, (*b)[:limit])
+func (b *ByteSlice) CopyOutN(cc marshal.CopyContext, addr usermem.Addr, limit int) (int, error) {
+	return cc.CopyOutBytes(addr, (*b)[:limit])
 }
 
 // WriteTo implements io.WriterTo.WriteTo.
@@ -126,13 +127,53 @@ var _ marshal.Marshallable = (*ByteSlice)(nil)
 // Below, we define some convenience functions for marshalling primitive types
 // using the newtypes above, without requiring superfluous casts.
 
+// 8-bit integers
+
+// CopyInt8In is a convenient wrapper for copying in an int8 from the task's
+// memory.
+func CopyInt8In(cc marshal.CopyContext, addr usermem.Addr, dst *int8) (int, error) {
+	var buf Int8
+	n, err := buf.CopyIn(cc, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = int8(buf)
+	return n, nil
+}
+
+// CopyInt8Out is a convenient wrapper for copying out an int8 to the task's
+// memory.
+func CopyInt8Out(cc marshal.CopyContext, addr usermem.Addr, src int8) (int, error) {
+	srcP := Int8(src)
+	return srcP.CopyOut(cc, addr)
+}
+
+// CopyUint8In is a convenient wrapper for copying in a uint8 from the task's
+// memory.
+func CopyUint8In(cc marshal.CopyContext, addr usermem.Addr, dst *uint8) (int, error) {
+	var buf Uint8
+	n, err := buf.CopyIn(cc, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = uint8(buf)
+	return n, nil
+}
+
+// CopyUint8Out is a convenient wrapper for copying out a uint8 to the task's
+// memory.
+func CopyUint8Out(cc marshal.CopyContext, addr usermem.Addr, src uint8) (int, error) {
+	srcP := Uint8(src)
+	return srcP.CopyOut(cc, addr)
+}
+
 // 16-bit integers
 
 // CopyInt16In is a convenient wrapper for copying in an int16 from the task's
 // memory.
-func CopyInt16In(task marshal.Task, addr usermem.Addr, dst *int16) (int, error) {
+func CopyInt16In(cc marshal.CopyContext, addr usermem.Addr, dst *int16) (int, error) {
 	var buf Int16
-	n, err := buf.CopyIn(task, addr)
+	n, err := buf.CopyIn(cc, addr)
 	if err != nil {
 		return n, err
 	}
@@ -142,16 +183,16 @@ func CopyInt16In(task marshal.Task, addr usermem.Addr, dst *int16) (int, error)
 
 // CopyInt16Out is a convenient wrapper for copying out an int16 to the task's
 // memory.
-func CopyInt16Out(task marshal.Task, addr usermem.Addr, src int16) (int, error) {
+func CopyInt16Out(cc marshal.CopyContext, addr usermem.Addr, src int16) (int, error) {
 	srcP := Int16(src)
-	return srcP.CopyOut(task, addr)
+	return srcP.CopyOut(cc, addr)
 }
 
 // CopyUint16In is a convenient wrapper for copying in a uint16 from the task's
 // memory.
-func CopyUint16In(task marshal.Task, addr usermem.Addr, dst *uint16) (int, error) {
+func CopyUint16In(cc marshal.CopyContext, addr usermem.Addr, dst *uint16) (int, error) {
 	var buf Uint16
-	n, err := buf.CopyIn(task, addr)
+	n, err := buf.CopyIn(cc, addr)
 	if err != nil {
 		return n, err
 	}
@@ -161,18 +202,18 @@ func CopyUint16In(task marshal.Task, addr usermem.Addr, dst *uint16) (int, error
 
 // CopyUint16Out is a convenient wrapper for copying out a uint16 to the task's
 // memory.
-func CopyUint16Out(task marshal.Task, addr usermem.Addr, src uint16) (int, error) {
+func CopyUint16Out(cc marshal.CopyContext, addr usermem.Addr, src uint16) (int, error) {
 	srcP := Uint16(src)
-	return srcP.CopyOut(task, addr)
+	return srcP.CopyOut(cc, addr)
 }
 
 // 32-bit integers
 
 // CopyInt32In is a convenient wrapper for copying in an int32 from the task's
 // memory.
-func CopyInt32In(task marshal.Task, addr usermem.Addr, dst *int32) (int, error) {
+func CopyInt32In(cc marshal.CopyContext, addr usermem.Addr, dst *int32) (int, error) {
 	var buf Int32
-	n, err := buf.CopyIn(task, addr)
+	n, err := buf.CopyIn(cc, addr)
 	if err != nil {
 		return n, err
 	}
@@ -182,16 +223,16 @@ func CopyInt32In(task marshal.Task, addr usermem.Addr, dst *int32) (int, error)
 
 // CopyInt32Out is a convenient wrapper for copying out an int32 to the task's
 // memory.
-func CopyInt32Out(task marshal.Task, addr usermem.Addr, src int32) (int, error) {
+func CopyInt32Out(cc marshal.CopyContext, addr usermem.Addr, src int32) (int, error) {
 	srcP := Int32(src)
-	return srcP.CopyOut(task, addr)
+	return srcP.CopyOut(cc, addr)
 }
 
 // CopyUint32In is a convenient wrapper for copying in a uint32 from the task's
 // memory.
-func CopyUint32In(task marshal.Task, addr usermem.Addr, dst *uint32) (int, error) {
+func CopyUint32In(cc marshal.CopyContext, addr usermem.Addr, dst *uint32) (int, error) {
 	var buf Uint32
-	n, err := buf.CopyIn(task, addr)
+	n, err := buf.CopyIn(cc, addr)
 	if err != nil {
 		return n, err
 	}
@@ -201,18 +242,18 @@ func CopyUint32In(task marshal.Task, addr usermem.Addr, dst *uint32) (int, error
 
 // CopyUint32Out is a convenient wrapper for copying out a uint32 to the task's
 // memory.
-func CopyUint32Out(task marshal.Task, addr usermem.Addr, src uint32) (int, error) {
+func CopyUint32Out(cc marshal.CopyContext, addr usermem.Addr, src uint32) (int, error) {
 	srcP := Uint32(src)
-	return srcP.CopyOut(task, addr)
+	return srcP.CopyOut(cc, addr)
 }
 
 // 64-bit integers
 
 // CopyInt64In is a convenient wrapper for copying in an int64 from the task's
 // memory.
-func CopyInt64In(task marshal.Task, addr usermem.Addr, dst *int64) (int, error) {
+func CopyInt64In(cc marshal.CopyContext, addr usermem.Addr, dst *int64) (int, error) {
 	var buf Int64
-	n, err := buf.CopyIn(task, addr)
+	n, err := buf.CopyIn(cc, addr)
 	if err != nil {
 		return n, err
 	}
@@ -222,16 +263,16 @@ func CopyInt64In(task marshal.Task, addr usermem.Addr, dst *int64) (int, error)
 
 // CopyInt64Out is a convenient wrapper for copying out an int64 to the task's
 // memory.
-func CopyInt64Out(task marshal.Task, addr usermem.Addr, src int64) (int, error) {
+func CopyInt64Out(cc marshal.CopyContext, addr usermem.Addr, src int64) (int, error) {
 	srcP := Int64(src)
-	return srcP.CopyOut(task, addr)
+	return srcP.CopyOut(cc, addr)
 }
 
 // CopyUint64In is a convenient wrapper for copying in a uint64 from the task's
 // memory.
-func CopyUint64In(task marshal.Task, addr usermem.Addr, dst *uint64) (int, error) {
+func CopyUint64In(cc marshal.CopyContext, addr usermem.Addr, dst *uint64) (int, error) {
 	var buf Uint64
-	n, err := buf.CopyIn(task, addr)
+	n, err := buf.CopyIn(cc, addr)
 	if err != nil {
 		return n, err
 	}
@@ -241,7 +282,68 @@ func CopyUint64In(task marshal.Task, addr usermem.Addr, dst *uint64) (int, error
 
 // CopyUint64Out is a convenient wrapper for copying out a uint64 to the task's
 // memory.
-func CopyUint64Out(task marshal.Task, addr usermem.Addr, src uint64) (int, error) {
+func CopyUint64Out(cc marshal.CopyContext, addr usermem.Addr, src uint64) (int, error) {
 	srcP := Uint64(src)
-	return srcP.CopyOut(task, addr)
+	return srcP.CopyOut(cc, addr)
+}
+
+// CopyByteSliceIn is a convenient wrapper for copying in a []byte from the
+// task's memory.
+func CopyByteSliceIn(cc marshal.CopyContext, addr usermem.Addr, dst *[]byte) (int, error) {
+	var buf ByteSlice
+	n, err := buf.CopyIn(cc, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = []byte(buf)
+	return n, nil
+}
+
+// CopyByteSliceOut is a convenient wrapper for copying out a []byte to the
+// task's memory.
+func CopyByteSliceOut(cc marshal.CopyContext, addr usermem.Addr, src []byte) (int, error) {
+	srcP := ByteSlice(src)
+	return srcP.CopyOut(cc, addr)
+}
+
+// CopyStringIn is a convenient wrapper for copying in a string from the
+// task's memory.
+func CopyStringIn(cc marshal.CopyContext, addr usermem.Addr, dst *string) (int, error) {
+	var buf ByteSlice
+	n, err := buf.CopyIn(cc, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = string(buf)
+	return n, nil
+}
+
+// CopyStringOut is a convenient wrapper for copying out a string to the task's
+// memory.
+func CopyStringOut(cc marshal.CopyContext, addr usermem.Addr, src string) (int, error) {
+	srcP := ByteSlice(src)
+	return srcP.CopyOut(cc, addr)
+}
+
+// IOCopyContext wraps an object implementing usermem.IO to implement
+// marshal.CopyContext.
+type IOCopyContext struct {
+	Ctx  context.Context
+	IO   usermem.IO
+	Opts usermem.IOOpts
+}
+
+// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer.
+func (i *IOCopyContext) CopyScratchBuffer(size int) []byte {
+	return make([]byte, size)
+}
+
+// CopyOutBytes implements marshal.CopyContext.CopyOutBytes.
+func (i *IOCopyContext) CopyOutBytes(addr usermem.Addr, b []byte) (int, error) {
+	return i.IO.CopyOut(i.Ctx, addr, b, i.Opts)
+}
+
+// CopyInBytes implements marshal.CopyContext.CopyInBytes.
+func (i *IOCopyContext) CopyInBytes(addr usermem.Addr, b []byte) (int, error) {
+	return i.IO.CopyIn(i.Ctx, addr, b, i.Opts)
 }
diff --git a/pkg/merkletree/merkletree.go b/pkg/merkletree/merkletree.go
index 955c9c473..d8227b8bd 100644
--- a/pkg/merkletree/merkletree.go
+++ b/pkg/merkletree/merkletree.go
@@ -29,13 +29,19 @@ const (
 	sha256DigestSize = 32
 )
 
+// DigestSize returns the size (in bytes) of a digest.
+// TODO(b/156980949): Allow config other hash methods (SHA384/SHA512).
+func DigestSize() int {
+	return sha256DigestSize
+}
+
 // Layout defines the scale of a Merkle tree.
 type Layout struct {
 	// blockSize is the size of a data block to be hashed.
 	blockSize int64
 	// digestSize is the size of a generated hash.
 	digestSize int64
-	// levelOffset contains the offset of the begnning of each level in
+	// levelOffset contains the offset of the beginning of each level in
 	// bytes. The number of levels in the tree is the length of the slice.
 	// The leaf nodes (level 0) contain hashes of blocks of the input data.
 	// Each level N contains hashes of the blocks in level N-1. The highest
@@ -45,12 +51,25 @@ type Layout struct {
 
 // InitLayout initializes and returns a new Layout object describing the structure
 // of a tree. dataSize specifies the size of input data in bytes.
-func InitLayout(dataSize int64) Layout {
+func InitLayout(dataSize int64, dataAndTreeInSameFile bool) Layout {
 	layout := Layout{
 		blockSize: usermem.PageSize,
 		// TODO(b/156980949): Allow config other hash methods (SHA384/SHA512).
 		digestSize: sha256DigestSize,
 	}
+
+	// treeStart is the offset (in bytes) of the first level of the tree in
+	// the file. If data and tree are in different files, treeStart should
+	// be zero. If data is in the same file as the tree, treeStart points
+	// to the block after the last data block (which may be zero-padded).
+	var treeStart int64
+	if dataAndTreeInSameFile {
+		treeStart = dataSize
+		if dataSize%layout.blockSize != 0 {
+			treeStart += layout.blockSize - dataSize%layout.blockSize
+		}
+	}
+
 	numBlocks := (dataSize + layout.blockSize - 1) / layout.blockSize
 	level := 0
 	offset := int64(0)
@@ -60,14 +79,15 @@ func InitLayout(dataSize int64) Layout {
 	// contain the hashes of the data blocks, while level numLevels - 1 is
 	// the root.
 	for numBlocks > 1 {
-		layout.levelOffset = append(layout.levelOffset, offset*layout.blockSize)
+		layout.levelOffset = append(layout.levelOffset, treeStart+offset*layout.blockSize)
 		// Round numBlocks up to fill up a block.
 		numBlocks += (layout.hashesPerBlock() - numBlocks%layout.hashesPerBlock()) % layout.hashesPerBlock()
 		offset += numBlocks / layout.hashesPerBlock()
 		numBlocks = numBlocks / layout.hashesPerBlock()
 		level++
 	}
-	layout.levelOffset = append(layout.levelOffset, offset*layout.blockSize)
+	layout.levelOffset = append(layout.levelOffset, treeStart+offset*layout.blockSize)
+
 	return layout
 }
 
@@ -103,14 +123,72 @@ func (layout Layout) blockOffset(level int, index int64) int64 {
 	return layout.levelOffset[level] + index*layout.blockSize
 }
 
-// Generate constructs a Merkle tree for the contents of data. The output is
-// written to treeWriter. The treeReader should be able to read the tree after
-// it has been written. That is, treeWriter and treeReader should point to the
-// same underlying data but have separate cursors.
-func Generate(data io.Reader, dataSize int64, treeReader io.Reader, treeWriter io.Writer) ([]byte, error) {
-	layout := InitLayout(dataSize)
+// VerityDescriptor is a struct that is serialized and hashed to get a file's
+// root hash, which contains the root hash of the raw content and the file's
+// meatadata.
+type VerityDescriptor struct {
+	Name     string
+	Mode     uint32
+	UID      uint32
+	GID      uint32
+	RootHash []byte
+}
 
-	numBlocks := (dataSize + layout.blockSize - 1) / layout.blockSize
+func (d *VerityDescriptor) String() string {
+	return fmt.Sprintf("Name: %s, Mode: %d, UID: %d, GID: %d, RootHash: %v", d.Name, d.Mode, d.UID, d.GID, d.RootHash)
+}
+
+// verify generates a hash from d, and compares it with expected.
+func (d *VerityDescriptor) verify(expected []byte) error {
+	h := sha256.Sum256([]byte(d.String()))
+	if !bytes.Equal(h[:], expected) {
+		return fmt.Errorf("unexpected root hash")
+	}
+	return nil
+}
+
+// GenerateParams contains the parameters used to generate a Merkle tree.
+type GenerateParams struct {
+	// File is a reader of the file to be hashed.
+	File io.ReaderAt
+	// Size is the size of the file.
+	Size int64
+	// Name is the name of the target file.
+	Name string
+	// Mode is the mode of the target file.
+	Mode uint32
+	// UID is the user ID of the target file.
+	UID uint32
+	// GID is the group ID of the target file.
+	GID uint32
+	// TreeReader is a reader for the Merkle tree.
+	TreeReader io.ReaderAt
+	// TreeWriter is a writer for the Merkle tree.
+	TreeWriter io.Writer
+	// DataAndTreeInSameFile is true if data and Merkle tree are in the same
+	// file, or false if Merkle tree is a separate file from data.
+	DataAndTreeInSameFile bool
+}
+
+// Generate constructs a Merkle tree for the contents of params.File. The
+// output is written to params.TreeWriter.
+//
+// Generate returns a hash of a VerityDescriptor, which contains the file
+// metadata and the hash from file content.
+func Generate(params *GenerateParams) ([]byte, error) {
+	layout := InitLayout(params.Size, params.DataAndTreeInSameFile)
+
+	numBlocks := (params.Size + layout.blockSize - 1) / layout.blockSize
+
+	// If the data is in the same file as the tree, zero pad the last data
+	// block.
+	bytesInLastBlock := params.Size % layout.blockSize
+	if params.DataAndTreeInSameFile && bytesInLastBlock != 0 {
+		zeroBuf := make([]byte, layout.blockSize-bytesInLastBlock)
+		if _, err := params.TreeWriter.Write(zeroBuf); err != nil {
+			return nil, err
+		}
+	}
 
 	var root []byte
 	for level := 0; level < layout.numLevels(); level++ {
@@ -123,11 +201,11 @@ func Generate(data io.Reader, dataSize int64, treeReader io.Reader, treeWriter i
 			if level == 0 {
 				// Read data block from the target file since level 0 includes hashes
 				// of blocks in the input data.
-				n, err = data.Read(buf)
+				n, err = params.File.ReadAt(buf, i*layout.blockSize)
 			} else {
 				// Read data block from the tree file since levels higher than 0 are
 				// hashing the lower level hashes.
-				n, err = treeReader.Read(buf)
+				n, err = params.TreeReader.ReadAt(buf, layout.blockOffset(level-1, i))
 			}
 
 			// err is populated as long as the bytes read is smaller than the buffer
@@ -147,7 +225,7 @@ func Generate(data io.Reader, dataSize int64, treeReader io.Reader, treeWriter i
 			}
 
 			// Write the generated hash to the end of the tree file.
-			if _, err = treeWriter.Write(digest[:]); err != nil {
+			if _, err = params.TreeWriter.Write(digest[:]); err != nil {
 				return nil, err
 			}
 		}
@@ -155,61 +233,111 @@ func Generate(data io.Reader, dataSize int64, treeReader io.Reader, treeWriter i
 		// remaining of the last block. But no need to do so for root.
 		if level != layout.rootLevel() && numBlocks%layout.hashesPerBlock() != 0 {
 			zeroBuf := make([]byte, layout.blockSize-(numBlocks%layout.hashesPerBlock())*layout.digestSize)
-			if _, err := treeWriter.Write(zeroBuf[:]); err != nil {
+			if _, err := params.TreeWriter.Write(zeroBuf[:]); err != nil {
 				return nil, err
 			}
 		}
 		numBlocks = (numBlocks + layout.hashesPerBlock() - 1) / layout.hashesPerBlock()
 	}
-	return root, nil
+	descriptor := VerityDescriptor{
+		Name:     params.Name,
+		Mode:     params.Mode,
+		UID:      params.UID,
+		GID:      params.GID,
+		RootHash: root,
+	}
+	ret := sha256.Sum256([]byte(descriptor.String()))
+	return ret[:], nil
+}
+
+// VerifyParams contains the params used to verify a portion of a file against
+// a Merkle tree.
+type VerifyParams struct {
+	// Out will be filled with verified data.
+	Out io.Writer
+	// File is a handler on the file to be verified.
+	File io.ReaderAt
+	// tree is a handler on the Merkle tree used to verify file.
+	Tree io.ReaderAt
+	// Size is the size of the file.
+	Size int64
+	// Name is the name of the target file.
+	Name string
+	// Mode is the mode of the target file.
+	Mode uint32
+	// UID is the user ID of the target file.
+	UID uint32
+	// GID is the group ID of the target file.
+	GID uint32
+	// ReadOffset is the offset of the data range to be verified.
+	ReadOffset int64
+	// ReadSize is the size of the data range to be verified.
+	ReadSize int64
+	// Expected is a trusted hash for the file. It is compared with the
+	// calculated root hash to verify the content.
+	Expected []byte
+	// DataAndTreeInSameFile is true if data and Merkle tree are in the same
+	// file, or false if Merkle tree is a separate file from data.
+	DataAndTreeInSameFile bool
+}
+
+// verifyMetadata verifies the metadata by hashing a descriptor that contains
+// the metadata and compare the generated hash with expected.
+//
+// For verifyMetadata, params.data is not needed. It only accesses params.tree
+// for the raw root hash.
+func verifyMetadata(params *VerifyParams, layout *Layout) error {
+	root := make([]byte, layout.digestSize)
+	if _, err := params.Tree.ReadAt(root, layout.blockOffset(layout.rootLevel(), 0 /* index */)); err != nil {
+		return fmt.Errorf("failed to read root hash: %w", err)
+	}
+	descriptor := VerityDescriptor{
+		Name:     params.Name,
+		Mode:     params.Mode,
+		UID:      params.UID,
+		GID:      params.GID,
+		RootHash: root,
+	}
+	return descriptor.verify(params.Expected)
 }
 
 // Verify verifies the content read from data with offset. The content is
 // verified against tree. If content spans across multiple blocks, each block is
 // verified. Verification fails if the hash of the data does not match the tree
-// at any level, or if the final root hash does not match expectedRoot.
-// Once the data is verified, it will be written using w.
-// Verify will modify the cursor for data, but always restores it to its
-// original position upon exit. The cursor for tree is modified and not
-// restored.
-func Verify(w io.Writer, data, tree io.ReadSeeker, dataSize int64, readOffset int64, readSize int64, expectedRoot []byte) error {
-	if readSize <= 0 {
-		return fmt.Errorf("Unexpected read size: %d", readSize)
+// at any level, or if the final root hash does not match expected.
+// Once the data is verified, it will be written using params.Out.
+//
+// Verify checks for both target file content and metadata. If readSize is 0,
+// only metadata is checked.
+func Verify(params *VerifyParams) (int64, error) {
+	if params.ReadSize < 0 {
+		return 0, fmt.Errorf("unexpected read size: %d", params.ReadSize)
+	}
+	layout := InitLayout(int64(params.Size), params.DataAndTreeInSameFile)
+	if params.ReadSize == 0 {
+		return 0, verifyMetadata(params, &layout)
 	}
-	layout := InitLayout(int64(dataSize))
 
 	// Calculate the index of blocks that includes the target range in input
 	// data.
-	firstDataBlock := readOffset / layout.blockSize
-	lastDataBlock := (readOffset + readSize - 1) / layout.blockSize
-
-	// Store the current offset, so we can set it back once verification
-	// finishes.
-	origOffset, err := data.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return fmt.Errorf("Find current data offset failed: %v", err)
-	}
-	defer data.Seek(origOffset, io.SeekStart)
-
-	// Move to the first block that contains target data.
-	if _, err := data.Seek(firstDataBlock*layout.blockSize, io.SeekStart); err != nil {
-		return fmt.Errorf("Seek to datablock start failed: %v", err)
-	}
+	firstDataBlock := params.ReadOffset / layout.blockSize
+	lastDataBlock := (params.ReadOffset + params.ReadSize - 1) / layout.blockSize
 
 	buf := make([]byte, layout.blockSize)
 	var readErr error
-	bytesRead := 0
+	total := int64(0)
 	for i := firstDataBlock; i <= lastDataBlock; i++ {
 		// Read a block that includes all or part of target range in
 		// input data.
-		bytesRead, readErr = data.Read(buf)
+		bytesRead, err := params.File.ReadAt(buf, i*layout.blockSize)
+		readErr = err
 		// If at the end of input data and all previous blocks are
 		// verified, return the verified input data and EOF.
 		if readErr == io.EOF && bytesRead == 0 {
 			break
 		}
 		if readErr != nil && readErr != io.EOF {
-			return fmt.Errorf("Read from data failed: %v", err)
+			return 0, fmt.Errorf("read from data failed: %w", err)
 		}
 		// If this is the end of file, zero the remaining bytes in buf,
 		// otherwise they are still from the previous block.
@@ -220,22 +348,29 @@ func Verify(w io.Writer, data, tree io.ReadSeeker, dataSize int64, readOffset in
 				buf[j] = 0
 			}
 		}
-		if err := verifyBlock(tree, layout, buf, i, expectedRoot); err != nil {
-			return err
+		descriptor := VerityDescriptor{
+			Name: params.Name,
+			Mode: params.Mode,
+			UID:  params.UID,
+			GID:  params.GID,
 		}
+		if err := verifyBlock(params.Tree, &descriptor, &layout, buf, i, params.Expected); err != nil {
+			return 0, err
+		}
+
 		// startOff is the beginning of the read range within the
 		// current data block. Note that for all blocks other than the
 		// first, startOff should be 0.
 		startOff := int64(0)
 		if i == firstDataBlock {
-			startOff = readOffset % layout.blockSize
+			startOff = params.ReadOffset % layout.blockSize
 		}
 		// endOff is the end of the read range within the current data
 		// block. Note that for all blocks other than the last,  endOff
 		// should be the block size.
 		endOff := layout.blockSize
 		if i == lastDataBlock {
-			endOff = (readOffset+readSize-1)%layout.blockSize + 1
+			endOff = (params.ReadOffset+params.ReadSize-1)%layout.blockSize + 1
 		}
 		// If the provided size exceeds the end of input data, we should
 		// only copy the parts in buf that's part of input data.
@@ -245,19 +380,22 @@ func Verify(w io.Writer, data, tree io.ReadSeeker, dataSize int64, readOffset in
 		if endOff > int64(bytesRead) {
 			endOff = int64(bytesRead)
 		}
-		w.Write(buf[startOff:endOff])
+		n, err := params.Out.Write(buf[startOff:endOff])
+		if err != nil {
+			return total, err
+		}
+		total += int64(n)
 
 	}
-	return readErr
+	return total, readErr
 }
 
 // verifyBlock verifies a block against tree. index is the number of block in
 // original data. The block is verified through each level of the tree. It
 // fails if the calculated hash from block is different from any level of
 // hashes stored in tree. And the final root hash is compared with
-// expectedRoot.  verifyBlock modifies the cursor for tree. Users needs to
-// maintain the cursor if intended.
-func verifyBlock(tree io.ReadSeeker, layout Layout, dataBlock []byte, blockIndex int64, expectedRoot []byte) error {
+// expected.
+func verifyBlock(tree io.ReaderAt, descriptor *VerityDescriptor, layout *Layout, dataBlock []byte, blockIndex int64, expected []byte) error {
 	if len(dataBlock) != int(layout.blockSize) {
 		return fmt.Errorf("incorrect block size")
 	}
@@ -274,41 +412,27 @@ func verifyBlock(tree io.ReadSeeker, layout Layout, dataBlock []byte, blockIndex
 			// Read a block in previous level that contains the
 			// hash we just generated, and generate a next level
 			// hash from it.
-			if _, err := tree.Seek(layout.blockOffset(level-1, blockIndex), io.SeekStart); err != nil {
-				return err
-			}
-			if _, err := tree.Read(treeBlock); err != nil {
+			if _, err := tree.ReadAt(treeBlock, layout.blockOffset(level-1, blockIndex)); err != nil {
 				return err
 			}
 			digestArray := sha256.Sum256(treeBlock)
 			digest = digestArray[:]
 		}
 
-		// Move to stored hash for the current block, read the digest
-		// and store in expectedDigest.
-		if _, err := tree.Seek(layout.digestOffset(level, blockIndex), io.SeekStart); err != nil {
-			return err
-		}
-		if _, err := tree.Read(expectedDigest); err != nil {
+		// Read the digest for the current block and store in
+		// expectedDigest.
+		if _, err := tree.ReadAt(expectedDigest, layout.digestOffset(level, blockIndex)); err != nil {
 			return err
 		}
 
 		if !bytes.Equal(digest, expectedDigest) {
-			return fmt.Errorf("Verification failed")
-		}
-
-		// If this is the root layer, no need to generate next level
-		// hash.
-		if level == layout.rootLevel() {
-			break
+			return fmt.Errorf("verification failed")
 		}
 		blockIndex = blockIndex / layout.hashesPerBlock()
 	}
 
-	// Verification for the tree succeeded. Now compare the root hash in the
-	// tree with expectedRoot.
-	if !bytes.Equal(digest[:], expectedRoot) {
-		return fmt.Errorf("Verification failed")
-	}
-	return nil
+	// Verification for the tree succeeded. Now hash the descriptor with
+	// the root hash and compare it with expected.
+	descriptor.RootHash = digest
+	return descriptor.verify(expected)
 }
diff --git a/pkg/merkletree/merkletree_test.go b/pkg/merkletree/merkletree_test.go
index 911f61df9..e1350ebda 100644
--- a/pkg/merkletree/merkletree_test.go
+++ b/pkg/merkletree/merkletree_test.go
@@ -27,130 +27,153 @@ import (
 
 func TestLayout(t *testing.T) {
 	testCases := []struct {
-		dataSize            int64
-		expectedLevelOffset []int64
+		dataSize              int64
+		dataAndTreeInSameFile bool
+		expectedLevelOffset   []int64
 	}{
 		{
-			dataSize:            100,
-			expectedLevelOffset: []int64{0},
+			dataSize:              100,
+			dataAndTreeInSameFile: false,
+			expectedLevelOffset:   []int64{0},
 		},
 		{
-			dataSize:            1000000,
-			expectedLevelOffset: []int64{0, 2 * usermem.PageSize, 3 * usermem.PageSize},
+			dataSize:              100,
+			dataAndTreeInSameFile: true,
+			expectedLevelOffset:   []int64{usermem.PageSize},
 		},
 		{
-			dataSize:            4096 * int64(usermem.PageSize),
-			expectedLevelOffset: []int64{0, 32 * usermem.PageSize, 33 * usermem.PageSize},
+			dataSize:              1000000,
+			dataAndTreeInSameFile: false,
+			expectedLevelOffset:   []int64{0, 2 * usermem.PageSize, 3 * usermem.PageSize},
+		},
+		{
+			dataSize:              1000000,
+			dataAndTreeInSameFile: true,
+			expectedLevelOffset:   []int64{245 * usermem.PageSize, 247 * usermem.PageSize, 248 * usermem.PageSize},
+		},
+		{
+			dataSize:              4096 * int64(usermem.PageSize),
+			dataAndTreeInSameFile: false,
+			expectedLevelOffset:   []int64{0, 32 * usermem.PageSize, 33 * usermem.PageSize},
+		},
+		{
+			dataSize:              4096 * int64(usermem.PageSize),
+			dataAndTreeInSameFile: true,
+			expectedLevelOffset:   []int64{4096 * usermem.PageSize, 4128 * usermem.PageSize, 4129 * usermem.PageSize},
 		},
 	}
 
 	for _, tc := range testCases {
 		t.Run(fmt.Sprintf("%d", tc.dataSize), func(t *testing.T) {
-			p := InitLayout(tc.dataSize)
-			if p.blockSize != int64(usermem.PageSize) {
-				t.Errorf("got blockSize %d, want %d", p.blockSize, usermem.PageSize)
+			l := InitLayout(tc.dataSize, tc.dataAndTreeInSameFile)
+			if l.blockSize != int64(usermem.PageSize) {
+				t.Errorf("Got blockSize %d, want %d", l.blockSize, usermem.PageSize)
 			}
-			if p.digestSize != sha256DigestSize {
-				t.Errorf("got digestSize %d, want %d", p.digestSize, sha256DigestSize)
+			if l.digestSize != sha256DigestSize {
+				t.Errorf("Got digestSize %d, want %d", l.digestSize, sha256DigestSize)
 			}
-			if p.numLevels() != len(tc.expectedLevelOffset) {
-				t.Errorf("got levels %d, want %d", p.numLevels(), len(tc.expectedLevelOffset))
+			if l.numLevels() != len(tc.expectedLevelOffset) {
+				t.Errorf("Got levels %d, want %d", l.numLevels(), len(tc.expectedLevelOffset))
 			}
-			for i := 0; i < p.numLevels() && i < len(tc.expectedLevelOffset); i++ {
-				if p.levelOffset[i] != tc.expectedLevelOffset[i] {
-					t.Errorf("got levelStart[%d] %d, want %d", i, p.levelOffset[i], tc.expectedLevelOffset[i])
+			for i := 0; i < l.numLevels() && i < len(tc.expectedLevelOffset); i++ {
+				if l.levelOffset[i] != tc.expectedLevelOffset[i] {
+					t.Errorf("Got levelStart[%d] %d, want %d", i, l.levelOffset[i], tc.expectedLevelOffset[i])
 				}
 			}
 		})
 	}
 }
 
+const (
+	defaultName = "merkle_test"
+	defaultMode = 0644
+	defaultUID  = 0
+	defaultGID  = 0
+)
+
+// bytesReadWriter is used to read from/write to/seek in a byte array. Unlike
+// bytes.Buffer, it keeps the whole buffer during read so that it can be reused.
+type bytesReadWriter struct {
+	// bytes contains the underlying byte array.
+	bytes []byte
+	// readPos is the currently location for Read. Write always appends to
+	// the end of the array.
+	readPos int
+}
+
+func (brw *bytesReadWriter) Write(p []byte) (int, error) {
+	brw.bytes = append(brw.bytes, p...)
+	return len(p), nil
+}
+
+func (brw *bytesReadWriter) ReadAt(p []byte, off int64) (int, error) {
+	bytesRead := copy(p, brw.bytes[off:])
+	if bytesRead == 0 {
+		return bytesRead, io.EOF
+	}
+	return bytesRead, nil
+}
+
 func TestGenerate(t *testing.T) {
 	// The input data has size dataSize. It starts with the data in startWith,
 	// and all other bytes are zeroes.
 	testCases := []struct {
 		data         []byte
-		expectedRoot []byte
+		expectedHash []byte
 	}{
 		{
 			data:         bytes.Repeat([]byte{0}, usermem.PageSize),
-			expectedRoot: []byte{173, 127, 172, 178, 88, 111, 198, 233, 102, 192, 4, 215, 209, 209, 107, 2, 79, 88, 5, 255, 124, 180, 124, 122, 133, 218, 189, 139, 72, 137, 44, 167},
+			expectedHash: []byte{64, 253, 58, 72, 192, 131, 82, 184, 193, 33, 108, 142, 43, 46, 179, 134, 244, 21, 29, 190, 14, 39, 66, 129, 6, 46, 200, 211, 30, 247, 191, 252},
 		},
 		{
 			data:         bytes.Repeat([]byte{0}, 128*usermem.PageSize+1),
-			expectedRoot: []byte{62, 93, 40, 92, 161, 241, 30, 223, 202, 99, 39, 2, 132, 113, 240, 139, 117, 99, 79, 243, 54, 18, 100, 184, 141, 121, 238, 46, 149, 202, 203, 132},
+			expectedHash: []byte{182, 223, 218, 62, 65, 185, 160, 219, 93, 119, 186, 88, 205, 32, 122, 231, 173, 72, 78, 76, 65, 57, 177, 146, 159, 39, 44, 123, 230, 156, 97, 26},
 		},
 		{
 			data:         []byte{'a'},
-			expectedRoot: []byte{52, 75, 204, 142, 172, 129, 37, 14, 145, 137, 103, 203, 11, 162, 209, 205, 30, 169, 213, 72, 20, 28, 243, 24, 242, 2, 92, 43, 169, 59, 110, 210},
+			expectedHash: []byte{28, 201, 8, 36, 150, 178, 111, 5, 193, 212, 129, 205, 206, 124, 211, 90, 224, 142, 81, 183, 72, 165, 243, 240, 242, 241, 76, 127, 101, 61, 63, 11},
 		},
 		{
 			data:         bytes.Repeat([]byte{'a'}, usermem.PageSize),
-			expectedRoot: []byte{201, 62, 238, 45, 13, 176, 47, 16, 172, 199, 70, 13, 149, 118, 225, 34, 220, 248, 205, 83, 196, 191, 141, 252, 174, 27, 62, 116, 235, 207, 255, 90},
+			expectedHash: []byte{106, 58, 160, 152, 41, 68, 38, 108, 245, 74, 177, 84, 64, 193, 19, 176, 249, 86, 27, 193, 85, 164, 99, 240, 79, 104, 148, 222, 76, 46, 191, 79},
 		},
 	}
 
 	for _, tc := range testCases {
 		t.Run(fmt.Sprintf("%d:%v", len(tc.data), tc.data[0]), func(t *testing.T) {
-			var tree bytes.Buffer
-
-			root, err := Generate(bytes.NewBuffer(tc.data), int64(len(tc.data)), &tree, &tree)
-			if err != nil {
-				t.Fatalf("Generate failed: %v", err)
-			}
+			for _, dataAndTreeInSameFile := range []bool{false, true} {
+				var tree bytesReadWriter
+				params := GenerateParams{
+					Size:                  int64(len(tc.data)),
+					Name:                  defaultName,
+					Mode:                  defaultMode,
+					UID:                   defaultUID,
+					GID:                   defaultGID,
+					TreeReader:            &tree,
+					TreeWriter:            &tree,
+					DataAndTreeInSameFile: dataAndTreeInSameFile,
+				}
+				if dataAndTreeInSameFile {
+					tree.Write(tc.data)
+					params.File = &tree
+				} else {
+					params.File = &bytesReadWriter{
+						bytes: tc.data,
+					}
+				}
+				hash, err := Generate(&params)
+				if err != nil {
+					t.Fatalf("Got err: %v, want nil", err)
+				}
 
-			if !bytes.Equal(root, tc.expectedRoot) {
-				t.Errorf("Unexpected root")
+				if !bytes.Equal(hash, tc.expectedHash) {
+					t.Errorf("Got hash: %v, want %v", hash, tc.expectedHash)
+				}
 			}
 		})
 	}
 }
 
-// bytesReadWriter is used to read from/write to/seek in a byte array. Unlike
-// bytes.Buffer, it keeps the whole buffer during read so that it can be reused.
-type bytesReadWriter struct {
-	// bytes contains the underlying byte array.
-	bytes []byte
-	// readPos is the currently location for Read. Write always appends to
-	// the end of the array.
-	readPos int
-}
-
-func (brw *bytesReadWriter) Write(p []byte) (int, error) {
-	brw.bytes = append(brw.bytes, p...)
-	return len(p), nil
-}
-
-func (brw *bytesReadWriter) Read(p []byte) (int, error) {
-	if brw.readPos >= len(brw.bytes) {
-		return 0, io.EOF
-	}
-	bytesRead := copy(p, brw.bytes[brw.readPos:])
-	brw.readPos += bytesRead
-	if bytesRead < len(p) {
-		return bytesRead, io.EOF
-	}
-	return bytesRead, nil
-}
-
-func (brw *bytesReadWriter) Seek(offset int64, whence int) (int64, error) {
-	off := offset
-	if whence == io.SeekCurrent {
-		off += int64(brw.readPos)
-	}
-	if whence == io.SeekEnd {
-		off += int64(len(brw.bytes))
-	}
-	if off < 0 {
-		panic("seek with negative offset")
-	}
-	if off >= int64(len(brw.bytes)) {
-		return 0, io.EOF
-	}
-	brw.readPos = int(off)
-	return off, nil
-}
-
 func TestVerify(t *testing.T) {
 	// The input data has size dataSize. The portion to be verified ranges from
 	// verifyStart with verifySize. A bit is flipped in outOfRangeByteIndex to
@@ -165,6 +188,10 @@ func TestVerify(t *testing.T) {
 		// modified byte falls in verification range, Verify should
 		// fail, otherwise Verify should still succeed.
 		modifyByte    int64
+		modifyName    bool
+		modifyMode    bool
+		modifyUID     bool
+		modifyGID     bool
 		shouldSucceed bool
 	}{
 		// Verify range start outside the data range should fail.
@@ -193,12 +220,48 @@ func TestVerify(t *testing.T) {
 			modifyByte:    0,
 			shouldSucceed: false,
 		},
-		// Invalid verify range (0 size) should fail.
+		// 0 verify size should only verify metadata.
+		{
+			dataSize:      usermem.PageSize,
+			verifyStart:   0,
+			verifySize:    0,
+			modifyByte:    0,
+			shouldSucceed: true,
+		},
+		// Modified name should fail verification.
+		{
+			dataSize:      usermem.PageSize,
+			verifyStart:   0,
+			verifySize:    0,
+			modifyByte:    0,
+			modifyName:    true,
+			shouldSucceed: false,
+		},
+		// Modified mode should fail verification.
+		{
+			dataSize:      usermem.PageSize,
+			verifyStart:   0,
+			verifySize:    0,
+			modifyByte:    0,
+			modifyMode:    true,
+			shouldSucceed: false,
+		},
+		// Modified UID should fail verification.
 		{
 			dataSize:      usermem.PageSize,
 			verifyStart:   0,
 			verifySize:    0,
 			modifyByte:    0,
+			modifyUID:     true,
+			shouldSucceed: false,
+		},
+		// Modified GID should fail verification.
+		{
+			dataSize:      usermem.PageSize,
+			verifyStart:   0,
+			verifySize:    0,
+			modifyByte:    0,
+			modifyGID:     true,
 			shouldSucceed: false,
 		},
 		// The test cases below use a block-aligned verify range.
@@ -284,26 +347,79 @@ func TestVerify(t *testing.T) {
 			data := make([]byte, tc.dataSize)
 			// Generate random bytes in data.
 			rand.Read(data)
-			var tree bytesReadWriter
 
-			root, err := Generate(bytes.NewBuffer(data), int64(tc.dataSize), &tree, &tree)
-			if err != nil {
-				t.Fatalf("Generate failed: %v", err)
-			}
+			for _, dataAndTreeInSameFile := range []bool{false, true} {
+				var tree bytesReadWriter
+				genParams := GenerateParams{
+					Size:                  int64(len(data)),
+					Name:                  defaultName,
+					Mode:                  defaultMode,
+					UID:                   defaultUID,
+					GID:                   defaultGID,
+					TreeReader:            &tree,
+					TreeWriter:            &tree,
+					DataAndTreeInSameFile: dataAndTreeInSameFile,
+				}
+				if dataAndTreeInSameFile {
+					tree.Write(data)
+					genParams.File = &tree
+				} else {
+					genParams.File = &bytesReadWriter{
+						bytes: data,
+					}
+				}
+				hash, err := Generate(&genParams)
+				if err != nil {
+					t.Fatalf("Generate failed: %v", err)
+				}
 
-			// Flip a bit in data and checks Verify results.
-			var buf bytes.Buffer
-			data[tc.modifyByte] ^= 1
-			if tc.shouldSucceed {
-				if err := Verify(&buf, bytes.NewReader(data), &tree, tc.dataSize, tc.verifyStart, tc.verifySize, root); err != nil && err != io.EOF {
-					t.Errorf("Verification failed when expected to succeed: %v", err)
+				// Flip a bit in data and checks Verify results.
+				var buf bytes.Buffer
+				data[tc.modifyByte] ^= 1
+				verifyParams := VerifyParams{
+					Out:                   &buf,
+					File:                  bytes.NewReader(data),
+					Tree:                  &tree,
+					Size:                  tc.dataSize,
+					Name:                  defaultName,
+					Mode:                  defaultMode,
+					UID:                   defaultUID,
+					GID:                   defaultGID,
+					ReadOffset:            tc.verifyStart,
+					ReadSize:              tc.verifySize,
+					Expected:              hash,
+					DataAndTreeInSameFile: dataAndTreeInSameFile,
+				}
+				if tc.modifyName {
+					verifyParams.Name = defaultName + "abc"
+				}
+				if tc.modifyMode {
+					verifyParams.Mode = defaultMode + 1
 				}
-				if int64(buf.Len()) != tc.verifySize || !bytes.Equal(data[tc.verifyStart:tc.verifyStart+tc.verifySize], buf.Bytes()) {
-					t.Errorf("Incorrect output from Verify")
+				if tc.modifyUID {
+					verifyParams.UID = defaultUID + 1
 				}
-			} else {
-				if err := Verify(&buf, bytes.NewReader(data), &tree, tc.dataSize, tc.verifyStart, tc.verifySize, root); err == nil {
-					t.Errorf("Verification succeeded when expected to fail")
+				if tc.modifyGID {
+					verifyParams.GID = defaultGID + 1
+				}
+				if tc.shouldSucceed {
+					n, err := Verify(&verifyParams)
+					if err != nil && err != io.EOF {
+						t.Errorf("Verification failed when expected to succeed: %v", err)
+					}
+					if n != tc.verifySize {
+						t.Errorf("Got Verify output size %d, want %d", n, tc.verifySize)
+					}
+					if int64(buf.Len()) != tc.verifySize {
+						t.Errorf("Got Verify output buf size %d, want %d,", buf.Len(), tc.verifySize)
+					}
+					if !bytes.Equal(data[tc.verifyStart:tc.verifyStart+tc.verifySize], buf.Bytes()) {
+						t.Errorf("Incorrect output buf from Verify")
+					}
+				} else {
+					if _, err := Verify(&verifyParams); err == nil {
+						t.Errorf("Verification succeeded when expected to fail")
+					}
 				}
 			}
 		})
@@ -318,36 +434,88 @@ func TestVerifyRandom(t *testing.T) {
 	data := make([]byte, dataSize)
 	// Generate random bytes in data.
 	rand.Read(data)
-	var tree bytesReadWriter
 
-	root, err := Generate(bytes.NewBuffer(data), int64(dataSize), &tree, &tree)
-	if err != nil {
-		t.Fatalf("Generate failed: %v", err)
-	}
+	for _, dataAndTreeInSameFile := range []bool{false, true} {
+		var tree bytesReadWriter
+		genParams := GenerateParams{
+			Size:                  int64(len(data)),
+			Name:                  defaultName,
+			Mode:                  defaultMode,
+			UID:                   defaultUID,
+			GID:                   defaultGID,
+			TreeReader:            &tree,
+			TreeWriter:            &tree,
+			DataAndTreeInSameFile: dataAndTreeInSameFile,
+		}
 
-	// Pick a random portion of data.
-	start := rand.Int63n(dataSize - 1)
-	size := rand.Int63n(dataSize) + 1
+		if dataAndTreeInSameFile {
+			tree.Write(data)
+			genParams.File = &tree
+		} else {
+			genParams.File = &bytesReadWriter{
+				bytes: data,
+			}
+		}
+		hash, err := Generate(&genParams)
+		if err != nil {
+			t.Fatalf("Generate failed: %v", err)
+		}
 
-	var buf bytes.Buffer
-	// Checks that the random portion of data from the original data is
-	// verified successfully.
-	if err := Verify(&buf, bytes.NewReader(data), &tree, dataSize, start, size, root); err != nil && err != io.EOF {
-		t.Errorf("Verification failed for correct data: %v", err)
-	}
-	if size > dataSize-start {
-		size = dataSize - start
-	}
-	if int64(buf.Len()) != size || !bytes.Equal(data[start:start+size], buf.Bytes()) {
-		t.Errorf("Incorrect output from Verify")
-	}
+		// Pick a random portion of data.
+		start := rand.Int63n(dataSize - 1)
+		size := rand.Int63n(dataSize) + 1
+
+		var buf bytes.Buffer
+		verifyParams := VerifyParams{
+			Out:                   &buf,
+			File:                  bytes.NewReader(data),
+			Tree:                  &tree,
+			Size:                  dataSize,
+			Name:                  defaultName,
+			Mode:                  defaultMode,
+			UID:                   defaultUID,
+			GID:                   defaultGID,
+			ReadOffset:            start,
+			ReadSize:              size,
+			Expected:              hash,
+			DataAndTreeInSameFile: dataAndTreeInSameFile,
+		}
+
+		// Checks that the random portion of data from the original data is
+		// verified successfully.
+		n, err := Verify(&verifyParams)
+		if err != nil && err != io.EOF {
+			t.Errorf("Verification failed for correct data: %v", err)
+		}
+		if size > dataSize-start {
+			size = dataSize - start
+		}
+		if n != size {
+			t.Errorf("Got Verify output size %d, want %d", n, size)
+		}
+		if int64(buf.Len()) != size {
+			t.Errorf("Got Verify output buf size %d, want %d", buf.Len(), size)
+		}
+		if !bytes.Equal(data[start:start+size], buf.Bytes()) {
+			t.Errorf("Incorrect output buf from Verify")
+		}
+
+		// Verify that modified metadata should fail verification.
+		buf.Reset()
+		verifyParams.Name = defaultName + "abc"
+		if _, err := Verify(&verifyParams); err == nil {
+			t.Error("Verify succeeded for modified metadata, expect failure")
+		}
 
-	buf.Reset()
-	// Flip a random bit in randPortion, and check that verification fails.
-	randBytePos := rand.Int63n(size)
-	data[start+randBytePos] ^= 1
+		// Flip a random bit in randPortion, and check that verification fails.
+		buf.Reset()
+		randBytePos := rand.Int63n(size)
+		data[start+randBytePos] ^= 1
+		verifyParams.File = bytes.NewReader(data)
+		verifyParams.Name = defaultName
 
-	if err := Verify(&buf, bytes.NewReader(data), &tree, dataSize, start, size, root); err == nil {
-		t.Errorf("Verification succeeded for modified data")
+		if _, err := Verify(&verifyParams); err == nil {
+			t.Error("Verification succeeded for modified data, expect failure")
+		}
 	}
 }
diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD
index 58305009d..0a6a5d215 100644
--- a/pkg/metric/BUILD
+++ b/pkg/metric/BUILD
@@ -27,6 +27,6 @@ go_test(
     deps = [
         ":metric_go_proto",
         "//pkg/eventchannel",
-        "@com_github_golang_protobuf//proto:go_default_library",
+        "@org_golang_google_protobuf//proto:go_default_library",
     ],
 )
diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go
index 64aa365ce..d012c5734 100644
--- a/pkg/metric/metric.go
+++ b/pkg/metric/metric.go
@@ -106,8 +106,8 @@ type customUint64Metric struct {
 // after Initialized.
 //
 // Preconditions:
-//  * name must be globally unique.
-//  * Initialize/Disable have not been called.
+// * name must be globally unique.
+// * Initialize/Disable have not been called.
 func RegisterCustomUint64Metric(name string, cumulative, sync bool, units pb.MetricMetadata_Units, description string, value func() uint64) error {
 	if initialized {
 		return ErrInitializationDone
@@ -221,7 +221,7 @@ var (
 // EmitMetricUpdate is thread-safe.
 //
 // Preconditions:
-//  * Initialize has been called.
+// * Initialize has been called.
 func EmitMetricUpdate() {
 	emitMu.Lock()
 	defer emitMu.Unlock()
diff --git a/pkg/metric/metric_test.go b/pkg/metric/metric_test.go
index c425ea532..aefd0ea5c 100644
--- a/pkg/metric/metric_test.go
+++ b/pkg/metric/metric_test.go
@@ -17,7 +17,7 @@ package metric
 import (
 	"testing"
 
-	"github.com/golang/protobuf/proto"
+	"google.golang.org/protobuf/proto"
 	"gvisor.dev/gvisor/pkg/eventchannel"
 	pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto"
 )
diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go
index 2ee07b664..28fe081d6 100644
--- a/pkg/p9/client_file.go
+++ b/pkg/p9/client_file.go
@@ -54,6 +54,8 @@ func (c *Client) newFile(fid FID) *clientFile {
 //
 // This proxies all of the interfaces found in file.go.
 type clientFile struct {
+	DisallowServerCalls
+
 	// client is the originating client.
 	client *Client
 
@@ -283,6 +285,39 @@ func (c *clientFile) Close() error {
 	return nil
 }
 
+// SetAttrClose implements File.SetAttrClose.
+func (c *clientFile) SetAttrClose(valid SetAttrMask, attr SetAttr) error {
+	if !versionSupportsTsetattrclunk(c.client.version) {
+		setAttrErr := c.SetAttr(valid, attr)
+
+		// Try to close file even in case of failure above. Since the state of the
+		// file is unknown to the caller, it will not attempt to close the file
+		// again.
+		if err := c.Close(); err != nil {
+			return err
+		}
+
+		return setAttrErr
+	}
+
+	// Avoid double close.
+	if !atomic.CompareAndSwapUint32(&c.closed, 0, 1) {
+		return syscall.EBADF
+	}
+
+	// Send the message.
+	if err := c.client.sendRecv(&Tsetattrclunk{FID: c.fid, Valid: valid, SetAttr: attr}, &Rsetattrclunk{}); err != nil {
+		// If an error occurred, we toss away the FID. This isn't ideal,
+		// but I'm not sure what else makes sense in this context.
+		log.Warningf("Tsetattrclunk failed, losing FID %v: %v", c.fid, err)
+		return err
+	}
+
+	// Return the FID to the pool.
+	c.client.fidPool.Put(uint64(c.fid))
+	return nil
+}
+
 // Open implements File.Open.
 func (c *clientFile) Open(flags OpenFlags) (*fd.FD, QID, uint32, error) {
 	if atomic.LoadUint32(&c.closed) != 0 {
@@ -681,6 +716,3 @@ func (c *clientFile) Flush() error {
 
 	return c.client.sendRecv(&Tflushf{FID: c.fid}, &Rflushf{})
 }
-
-// Renamed implements File.Renamed.
-func (c *clientFile) Renamed(newDir File, newName string) {}
diff --git a/pkg/p9/file.go b/pkg/p9/file.go
index cab35896f..c2e3a3f98 100644
--- a/pkg/p9/file.go
+++ b/pkg/p9/file.go
@@ -135,6 +135,14 @@ type File interface {
 	// On the server, Close has no concurrency guarantee.
 	Close() error
 
+	// SetAttrClose is the equivalent of calling SetAttr() followed by Close().
+	// This can be used to set file times before closing the file in a single
+	// operation.
+	//
+	// On the server, SetAttr has a write concurrency guarantee.
+	// On the server, Close has no concurrency guarantee.
+	SetAttrClose(valid SetAttrMask, attr SetAttr) error
+
 	// Open must be called prior to using Read, Write or Readdir. Once Open
 	// is called, some operations, such as Walk, will no longer work.
 	//
@@ -286,3 +294,19 @@ type DefaultWalkGetAttr struct{}
 func (DefaultWalkGetAttr) WalkGetAttr([]string) ([]QID, File, AttrMask, Attr, error) {
 	return nil, nil, AttrMask{}, Attr{}, syscall.ENOSYS
 }
+
+// DisallowClientCalls panics if a client-only function is called.
+type DisallowClientCalls struct{}
+
+// SetAttrClose implements File.SetAttrClose.
+func (DisallowClientCalls) SetAttrClose(SetAttrMask, SetAttr) error {
+	panic("SetAttrClose should not be called on the server")
+}
+
+// DisallowServerCalls panics if a server-only function is called.
+type DisallowServerCalls struct{}
+
+// Renamed implements File.Renamed.
+func (*clientFile) Renamed(File, string) {
+	panic("Renamed should not be called on the client")
+}
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index 1db5797dd..abd237f46 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -123,6 +123,37 @@ func (t *Tclunk) handle(cs *connState) message {
 	return &Rclunk{}
 }
 
+func (t *Tsetattrclunk) handle(cs *connState) message {
+	ref, ok := cs.LookupFID(t.FID)
+	if !ok {
+		return newErr(syscall.EBADF)
+	}
+	defer ref.DecRef()
+
+	setAttrErr := ref.safelyWrite(func() error {
+		// We don't allow setattr on files that have been deleted.
+		// This might be technically incorrect, as it's possible that
+		// there were multiple links and you can still change the
+		// corresponding inode information.
+		if ref.isDeleted() {
+			return syscall.EINVAL
+		}
+
+		// Set the attributes.
+		return ref.file.SetAttr(t.Valid, t.SetAttr)
+	})
+
+	// Try to delete FID even in case of failure above. Since the state of the
+	// file is unknown to the caller, it will not attempt to close the file again.
+	if !cs.DeleteFID(t.FID) {
+		return newErr(syscall.EBADF)
+	}
+	if setAttrErr != nil {
+		return newErr(setAttrErr)
+	}
+	return &Rsetattrclunk{}
+}
+
 // handle implements handler.handle.
 func (t *Tremove) handle(cs *connState) message {
 	ref, ok := cs.LookupFID(t.FID)
diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go
index 2cb59f934..cf13cbb69 100644
--- a/pkg/p9/messages.go
+++ b/pkg/p9/messages.go
@@ -317,6 +317,64 @@ func (r *Rclunk) String() string {
 	return "Rclunk{}"
 }
 
+// Tsetattrclunk is a setattr+close request.
+type Tsetattrclunk struct {
+	// FID is the FID to change.
+	FID FID
+
+	// Valid is the set of bits which will be used.
+	Valid SetAttrMask
+
+	// SetAttr is the set request.
+	SetAttr SetAttr
+}
+
+// decode implements encoder.decode.
+func (t *Tsetattrclunk) decode(b *buffer) {
+	t.FID = b.ReadFID()
+	t.Valid.decode(b)
+	t.SetAttr.decode(b)
+}
+
+// encode implements encoder.encode.
+func (t *Tsetattrclunk) encode(b *buffer) {
+	b.WriteFID(t.FID)
+	t.Valid.encode(b)
+	t.SetAttr.encode(b)
+}
+
+// Type implements message.Type.
+func (*Tsetattrclunk) Type() MsgType {
+	return MsgTsetattrclunk
+}
+
+// String implements fmt.Stringer.
+func (t *Tsetattrclunk) String() string {
+	return fmt.Sprintf("Tsetattrclunk{FID: %d, Valid: %v, SetAttr: %s}", t.FID, t.Valid, t.SetAttr)
+}
+
+// Rsetattrclunk is a setattr+close response.
+type Rsetattrclunk struct {
+}
+
+// decode implements encoder.decode.
+func (*Rsetattrclunk) decode(*buffer) {
+}
+
+// encode implements encoder.encode.
+func (*Rsetattrclunk) encode(*buffer) {
+}
+
+// Type implements message.Type.
+func (*Rsetattrclunk) Type() MsgType {
+	return MsgRsetattrclunk
+}
+
+// String implements fmt.Stringer.
+func (r *Rsetattrclunk) String() string {
+	return "Rsetattrclunk{}"
+}
+
 // Tremove is a remove request.
 //
 // This will eventually be replaced by Tunlinkat.
@@ -2657,6 +2715,8 @@ func init() {
 	msgRegistry.register(MsgRlconnect, func() message { return &Rlconnect{} })
 	msgRegistry.register(MsgTallocate, func() message { return &Tallocate{} })
 	msgRegistry.register(MsgRallocate, func() message { return &Rallocate{} })
+	msgRegistry.register(MsgTsetattrclunk, func() message { return &Tsetattrclunk{} })
+	msgRegistry.register(MsgRsetattrclunk, func() message { return &Rsetattrclunk{} })
 	msgRegistry.register(MsgTchannel, func() message { return &Tchannel{} })
 	msgRegistry.register(MsgRchannel, func() message { return &Rchannel{} })
 }
diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go
index 7facc9f5e..bfeb6c236 100644
--- a/pkg/p9/messages_test.go
+++ b/pkg/p9/messages_test.go
@@ -376,6 +376,30 @@ func TestEncodeDecode(t *testing.T) {
 		&Rumknod{
 			Rmknod{QID: QID{Type: 1}},
 		},
+		&Tsetattrclunk{
+			FID: 1,
+			Valid: SetAttrMask{
+				Permissions:        true,
+				UID:                true,
+				GID:                true,
+				Size:               true,
+				ATime:              true,
+				MTime:              true,
+				CTime:              true,
+				ATimeNotSystemTime: true,
+				MTimeNotSystemTime: true,
+			},
+			SetAttr: SetAttr{
+				Permissions:      1,
+				UID:              2,
+				GID:              3,
+				Size:             4,
+				ATimeSeconds:     5,
+				ATimeNanoSeconds: 6,
+				MTimeSeconds:     7,
+				MTimeNanoSeconds: 8,
+			},
+		},
 	}
 
 	for _, enc := range objs {
diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go
index 122c457d2..2235f8968 100644
--- a/pkg/p9/p9.go
+++ b/pkg/p9/p9.go
@@ -315,86 +315,88 @@ type MsgType uint8
 
 // MsgType declarations.
 const (
-	MsgTlerror      MsgType = 6
-	MsgRlerror              = 7
-	MsgTstatfs              = 8
-	MsgRstatfs              = 9
-	MsgTlopen               = 12
-	MsgRlopen               = 13
-	MsgTlcreate             = 14
-	MsgRlcreate             = 15
-	MsgTsymlink             = 16
-	MsgRsymlink             = 17
-	MsgTmknod               = 18
-	MsgRmknod               = 19
-	MsgTrename              = 20
-	MsgRrename              = 21
-	MsgTreadlink            = 22
-	MsgRreadlink            = 23
-	MsgTgetattr             = 24
-	MsgRgetattr             = 25
-	MsgTsetattr             = 26
-	MsgRsetattr             = 27
-	MsgTlistxattr           = 28
-	MsgRlistxattr           = 29
-	MsgTxattrwalk           = 30
-	MsgRxattrwalk           = 31
-	MsgTxattrcreate         = 32
-	MsgRxattrcreate         = 33
-	MsgTgetxattr            = 34
-	MsgRgetxattr            = 35
-	MsgTsetxattr            = 36
-	MsgRsetxattr            = 37
-	MsgTremovexattr         = 38
-	MsgRremovexattr         = 39
-	MsgTreaddir             = 40
-	MsgRreaddir             = 41
-	MsgTfsync               = 50
-	MsgRfsync               = 51
-	MsgTlink                = 70
-	MsgRlink                = 71
-	MsgTmkdir               = 72
-	MsgRmkdir               = 73
-	MsgTrenameat            = 74
-	MsgRrenameat            = 75
-	MsgTunlinkat            = 76
-	MsgRunlinkat            = 77
-	MsgTversion             = 100
-	MsgRversion             = 101
-	MsgTauth                = 102
-	MsgRauth                = 103
-	MsgTattach              = 104
-	MsgRattach              = 105
-	MsgTflush               = 108
-	MsgRflush               = 109
-	MsgTwalk                = 110
-	MsgRwalk                = 111
-	MsgTread                = 116
-	MsgRread                = 117
-	MsgTwrite               = 118
-	MsgRwrite               = 119
-	MsgTclunk               = 120
-	MsgRclunk               = 121
-	MsgTremove              = 122
-	MsgRremove              = 123
-	MsgTflushf              = 124
-	MsgRflushf              = 125
-	MsgTwalkgetattr         = 126
-	MsgRwalkgetattr         = 127
-	MsgTucreate             = 128
-	MsgRucreate             = 129
-	MsgTumkdir              = 130
-	MsgRumkdir              = 131
-	MsgTumknod              = 132
-	MsgRumknod              = 133
-	MsgTusymlink            = 134
-	MsgRusymlink            = 135
-	MsgTlconnect            = 136
-	MsgRlconnect            = 137
-	MsgTallocate            = 138
-	MsgRallocate            = 139
-	MsgTchannel             = 250
-	MsgRchannel             = 251
+	MsgTlerror       MsgType = 6
+	MsgRlerror       MsgType = 7
+	MsgTstatfs       MsgType = 8
+	MsgRstatfs       MsgType = 9
+	MsgTlopen        MsgType = 12
+	MsgRlopen        MsgType = 13
+	MsgTlcreate      MsgType = 14
+	MsgRlcreate      MsgType = 15
+	MsgTsymlink      MsgType = 16
+	MsgRsymlink      MsgType = 17
+	MsgTmknod        MsgType = 18
+	MsgRmknod        MsgType = 19
+	MsgTrename       MsgType = 20
+	MsgRrename       MsgType = 21
+	MsgTreadlink     MsgType = 22
+	MsgRreadlink     MsgType = 23
+	MsgTgetattr      MsgType = 24
+	MsgRgetattr      MsgType = 25
+	MsgTsetattr      MsgType = 26
+	MsgRsetattr      MsgType = 27
+	MsgTlistxattr    MsgType = 28
+	MsgRlistxattr    MsgType = 29
+	MsgTxattrwalk    MsgType = 30
+	MsgRxattrwalk    MsgType = 31
+	MsgTxattrcreate  MsgType = 32
+	MsgRxattrcreate  MsgType = 33
+	MsgTgetxattr     MsgType = 34
+	MsgRgetxattr     MsgType = 35
+	MsgTsetxattr     MsgType = 36
+	MsgRsetxattr     MsgType = 37
+	MsgTremovexattr  MsgType = 38
+	MsgRremovexattr  MsgType = 39
+	MsgTreaddir      MsgType = 40
+	MsgRreaddir      MsgType = 41
+	MsgTfsync        MsgType = 50
+	MsgRfsync        MsgType = 51
+	MsgTlink         MsgType = 70
+	MsgRlink         MsgType = 71
+	MsgTmkdir        MsgType = 72
+	MsgRmkdir        MsgType = 73
+	MsgTrenameat     MsgType = 74
+	MsgRrenameat     MsgType = 75
+	MsgTunlinkat     MsgType = 76
+	MsgRunlinkat     MsgType = 77
+	MsgTversion      MsgType = 100
+	MsgRversion      MsgType = 101
+	MsgTauth         MsgType = 102
+	MsgRauth         MsgType = 103
+	MsgTattach       MsgType = 104
+	MsgRattach       MsgType = 105
+	MsgTflush        MsgType = 108
+	MsgRflush        MsgType = 109
+	MsgTwalk         MsgType = 110
+	MsgRwalk         MsgType = 111
+	MsgTread         MsgType = 116
+	MsgRread         MsgType = 117
+	MsgTwrite        MsgType = 118
+	MsgRwrite        MsgType = 119
+	MsgTclunk        MsgType = 120
+	MsgRclunk        MsgType = 121
+	MsgTremove       MsgType = 122
+	MsgRremove       MsgType = 123
+	MsgTflushf       MsgType = 124
+	MsgRflushf       MsgType = 125
+	MsgTwalkgetattr  MsgType = 126
+	MsgRwalkgetattr  MsgType = 127
+	MsgTucreate      MsgType = 128
+	MsgRucreate      MsgType = 129
+	MsgTumkdir       MsgType = 130
+	MsgRumkdir       MsgType = 131
+	MsgTumknod       MsgType = 132
+	MsgRumknod       MsgType = 133
+	MsgTusymlink     MsgType = 134
+	MsgRusymlink     MsgType = 135
+	MsgTlconnect     MsgType = 136
+	MsgRlconnect     MsgType = 137
+	MsgTallocate     MsgType = 138
+	MsgRallocate     MsgType = 139
+	MsgTsetattrclunk MsgType = 140
+	MsgRsetattrclunk MsgType = 141
+	MsgTchannel      MsgType = 250
+	MsgRchannel      MsgType = 251
 )
 
 // QIDType represents the file type for QIDs.
diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go
index 6e7bb3db2..6e605b14c 100644
--- a/pkg/p9/p9test/client_test.go
+++ b/pkg/p9/p9test/client_test.go
@@ -1225,22 +1225,31 @@ func TestOpen(t *testing.T) {
 func TestClose(t *testing.T) {
 	type closeTest struct {
 		name    string
-		closeFn func(backend *Mock, f p9.File)
+		closeFn func(backend *Mock, f p9.File) error
 	}
 
 	cases := []closeTest{
 		{
 			name: "close",
-			closeFn: func(_ *Mock, f p9.File) {
-				f.Close()
+			closeFn: func(_ *Mock, f p9.File) error {
+				return f.Close()
 			},
 		},
 		{
 			name: "remove",
-			closeFn: func(backend *Mock, f p9.File) {
+			closeFn: func(backend *Mock, f p9.File) error {
 				// Allow the rename call in the parent, automatically translated.
 				backend.parent.EXPECT().UnlinkAt(gomock.Any(), gomock.Any()).Times(1)
-				f.(deprecatedRemover).Remove()
+				return f.(deprecatedRemover).Remove()
+			},
+		},
+		{
+			name: "setAttrClose",
+			closeFn: func(backend *Mock, f p9.File) error {
+				valid := p9.SetAttrMask{ATime: true}
+				attr := p9.SetAttr{ATimeSeconds: 1, ATimeNanoSeconds: 2}
+				backend.EXPECT().SetAttr(valid, attr).Times(1)
+				return f.SetAttrClose(valid, attr)
 			},
 		},
 	}
@@ -1258,7 +1267,9 @@ func TestClose(t *testing.T) {
 				_, backend, f := walkHelper(h, name, root)
 
 				// Close via the prescribed method.
-				tc.closeFn(backend, f)
+				if err := tc.closeFn(backend, f); err != nil {
+					t.Fatalf("closeFn failed: %v", err)
+				}
 
 				// Everything should fail with EBADF.
 				if _, _, err := f.Walk(nil); err != syscall.EBADF {
diff --git a/pkg/p9/version.go b/pkg/p9/version.go
index 09cde9f5a..8d7168ef5 100644
--- a/pkg/p9/version.go
+++ b/pkg/p9/version.go
@@ -26,7 +26,7 @@ const (
 	//
 	// Clients are expected to start requesting this version number and
 	// to continuously decrement it until a Tversion request succeeds.
-	highestSupportedVersion uint32 = 11
+	highestSupportedVersion uint32 = 12
 
 	// lowestSupportedVersion is the lowest supported version X in a
 	// version string of the format 9P2000.L.Google.X.
@@ -173,3 +173,9 @@ func versionSupportsGetSetXattr(v uint32) bool {
 func versionSupportsListRemoveXattr(v uint32) bool {
 	return v >= 11
 }
+
+// versionSupportsTsetattrclunk returns true if version v supports
+// the Tsetattrclunk message.
+func versionSupportsTsetattrclunk(v uint32) bool {
+	return v >= 12
+}
diff --git a/pkg/procid/procid_amd64.s b/pkg/procid/procid_amd64.s
index 7c622e5d7..a45920040 100644
--- a/pkg/procid/procid_amd64.s
+++ b/pkg/procid/procid_amd64.s
@@ -14,7 +14,7 @@
 
 // +build amd64
 // +build go1.8
-// +build !go1.16
+// +build !go1.17
 
 #include "textflag.h"
 
diff --git a/pkg/procid/procid_arm64.s b/pkg/procid/procid_arm64.s
index 48ebb5fd1..9d3b0666d 100644
--- a/pkg/procid/procid_arm64.s
+++ b/pkg/procid/procid_arm64.s
@@ -14,7 +14,7 @@
 
 // +build arm64
 // +build go1.8
-// +build !go1.16
+// +build !go1.17
 
 #include "textflag.h"
 
diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go
index d9d5e6bcb..699ea8ac3 100644
--- a/pkg/refs/refcounter.go
+++ b/pkg/refs/refcounter.go
@@ -234,6 +234,41 @@ const (
 	LeaksLogTraces
 )
 
+// Set implements flag.Value.
+func (l *LeakMode) Set(v string) error {
+	switch v {
+	case "disabled":
+		*l = NoLeakChecking
+	case "log-names":
+		*l = LeaksLogWarning
+	case "log-traces":
+		*l = LeaksLogTraces
+	default:
+		return fmt.Errorf("invalid ref leak mode %q", v)
+	}
+	return nil
+}
+
+// Get implements flag.Value.
+func (l *LeakMode) Get() interface{} {
+	return *l
+}
+
+// String implements flag.Value.
+func (l *LeakMode) String() string {
+	switch *l {
+	case UninitializedLeakChecking:
+		return "uninitialized"
+	case NoLeakChecking:
+		return "disabled"
+	case LeaksLogWarning:
+		return "log-names"
+	case LeaksLogTraces:
+		return "log-traces"
+	}
+	panic(fmt.Sprintf("invalid ref leak mode %d", *l))
+}
+
 // leakMode stores the current mode for the reference leak checker.
 //
 // Values must be one of the LeakMode values.
diff --git a/pkg/refs_vfs2/BUILD b/pkg/refsvfs2/BUILD
index 7b3e10683..245e33d2d 100644
--- a/pkg/refs_vfs2/BUILD
+++ b/pkg/refsvfs2/BUILD
@@ -11,7 +11,7 @@ go_template(
     types = [
         "T",
     ],
-    visibility = ["//pkg/sentry:internal"],
+    visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
         "//pkg/refs",
@@ -19,8 +19,16 @@ go_template(
 )
 
 go_library(
-    name = "refs_vfs2",
-    srcs = ["refs.go"],
-    visibility = ["//pkg/sentry:internal"],
-    deps = ["//pkg/context"],
+    name = "refsvfs2",
+    srcs = [
+        "refs.go",
+        "refs_map.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/context",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/sync",
+    ],
 )
diff --git a/pkg/refs_vfs2/refs.go b/pkg/refsvfs2/refs.go
index 99a074e96..ef8beb659 100644
--- a/pkg/refs_vfs2/refs.go
+++ b/pkg/refsvfs2/refs.go
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package refs_vfs2 defines an interface for a reference-counted object.
-package refs_vfs2
+// Package refsvfs2 defines an interface for a reference-counted object.
+package refsvfs2
 
 import (
 	"gvisor.dev/gvisor/pkg/context"
diff --git a/pkg/refsvfs2/refs_map.go b/pkg/refsvfs2/refs_map.go
new file mode 100644
index 000000000..be75b0cc2
--- /dev/null
+++ b/pkg/refsvfs2/refs_map.go
@@ -0,0 +1,97 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package refsvfs2
+
+import (
+	"fmt"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/log"
+	refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// TODO(gvisor.dev/issue/1193): re-enable once kernfs refs are fixed.
+var ignored []string = []string{"kernfs.", "proc.", "sys.", "devpts.", "fuse."}
+
+var (
+	// liveObjects is a global map of reference-counted objects. Objects are
+	// inserted when leak check is enabled, and they are removed when they are
+	// destroyed. It is protected by liveObjectsMu.
+	liveObjects   map[CheckedObject]struct{}
+	liveObjectsMu sync.Mutex
+)
+
+// CheckedObject represents a reference-counted object with an informative
+// leak detection message.
+type CheckedObject interface {
+	// LeakMessage supplies a warning to be printed upon leak detection.
+	LeakMessage() string
+}
+
+func init() {
+	liveObjects = make(map[CheckedObject]struct{})
+}
+
+// LeakCheckEnabled returns whether leak checking is enabled. The following
+// functions should only be called if it returns true.
+func LeakCheckEnabled() bool {
+	return refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking
+}
+
+// Register adds obj to the live object map.
+func Register(obj CheckedObject, typ string) {
+	for _, str := range ignored {
+		if strings.Contains(typ, str) {
+			return
+		}
+	}
+	liveObjectsMu.Lock()
+	if _, ok := liveObjects[obj]; ok {
+		panic(fmt.Sprintf("Unexpected entry in leak checking map: reference %p already added", obj))
+	}
+	liveObjects[obj] = struct{}{}
+	liveObjectsMu.Unlock()
+}
+
+// Unregister removes obj from the live object map.
+func Unregister(obj CheckedObject, typ string) {
+	liveObjectsMu.Lock()
+	defer liveObjectsMu.Unlock()
+	if _, ok := liveObjects[obj]; !ok {
+		for _, str := range ignored {
+			if strings.Contains(typ, str) {
+				return
+			}
+		}
+		panic(fmt.Sprintf("Expected to find entry in leak checking map for reference %p", obj))
+	}
+	delete(liveObjects, obj)
+}
+
+// DoLeakCheck iterates through the live object map and logs a message for each
+// object. It is called once no reference-counted objects should be reachable
+// anymore, at which point anything left in the map is considered a leak.
+func DoLeakCheck() {
+	liveObjectsMu.Lock()
+	defer liveObjectsMu.Unlock()
+	leaked := len(liveObjects)
+	if leaked > 0 {
+		log.Warningf("Leak checking detected %d leaked objects:", leaked)
+		for obj := range liveObjects {
+			log.Warningf(obj.LeakMessage())
+		}
+	}
+}
diff --git a/pkg/refs_vfs2/refs_template.go b/pkg/refsvfs2/refs_template.go
index 99c43c065..ec295ef5b 100644
--- a/pkg/refs_vfs2/refs_template.go
+++ b/pkg/refsvfs2/refs_template.go
@@ -12,16 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package refs_template defines a template that can be used by reference counted
-// objects.
+// Package refs_template defines a template that can be used by reference
+// counted objects. The "owner" template parameter is used in log messages to
+// indicate the type of reference-counted object that exhibited a reference
+// leak. As a result, structs that are embedded in other structs should not use
+// this template, since it will make tracking down leaks more difficult.
 package refs_template
 
 import (
-	"runtime"
+	"fmt"
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/log"
-	refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
 )
 
 // T is the type of the reference counted object. It is only used to customize
@@ -50,24 +52,16 @@ type Refs struct {
 	refCount int64
 }
 
-func (r *Refs) finalize() {
-	var note string
-	switch refs_vfs1.GetLeakMode() {
-	case refs_vfs1.NoLeakChecking:
-		return
-	case refs_vfs1.UninitializedLeakChecking:
-		note = "(Leak checker uninitialized): "
-	}
-	if n := r.ReadRefs(); n != 0 {
-		log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, ownerType, n)
+// EnableLeakCheck enables reference leak checking on r.
+func (r *Refs) EnableLeakCheck() {
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Register(r, fmt.Sprintf("%T", ownerType))
 	}
 }
 
-// EnableLeakCheck checks for reference leaks when Refs gets garbage collected.
-func (r *Refs) EnableLeakCheck() {
-	if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking {
-		runtime.SetFinalizer(r, (*Refs).finalize)
-	}
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (r *Refs) LeakMessage() string {
+	return fmt.Sprintf("%T %p: reference count of %d instead of 0", ownerType, r, r.ReadRefs())
 }
 
 // ReadRefs returns the current number of references. The returned count is
@@ -82,7 +76,7 @@ func (r *Refs) ReadRefs() int64 {
 //go:nosplit
 func (r *Refs) IncRef() {
 	if v := atomic.AddInt64(&r.refCount, 1); v <= 0 {
-		panic("Incrementing non-positive ref count")
+		panic(fmt.Sprintf("Incrementing non-positive count %p on %T", r, ownerType))
 	}
 }
 
@@ -122,12 +116,21 @@ func (r *Refs) TryIncRef() bool {
 func (r *Refs) DecRef(destroy func()) {
 	switch v := atomic.AddInt64(&r.refCount, -1); {
 	case v < -1:
-		panic("Decrementing non-positive ref count")
+		panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, ownerType))
 
 	case v == -1:
+		if refsvfs2.LeakCheckEnabled() {
+			refsvfs2.Unregister(r, fmt.Sprintf("%T", ownerType))
+		}
 		// Call the destructor.
 		if destroy != nil {
 			destroy()
 		}
 	}
 }
+
+func (r *Refs) afterLoad() {
+	if refsvfs2.LeakCheckEnabled() && r.ReadRefs() > 0 {
+		r.EnableLeakCheck()
+	}
+}
diff --git a/pkg/safemem/BUILD b/pkg/safemem/BUILD
index ce30382ab..68ed074f8 100644
--- a/pkg/safemem/BUILD
+++ b/pkg/safemem/BUILD
@@ -11,9 +11,7 @@ go_library(
         "seq_unsafe.go",
     ],
     visibility = ["//:sandbox"],
-    deps = [
-        "//pkg/safecopy",
-    ],
+    deps = ["//pkg/safecopy"],
 )
 
 go_test(
diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go
index f5f0574f8..fc4049eeb 100644
--- a/pkg/safemem/seq_unsafe.go
+++ b/pkg/safemem/seq_unsafe.go
@@ -91,9 +91,10 @@ func BlockSeqFromSlice(slice []Block) BlockSeq {
 	return blockSeqFromSliceLimited(slice, limit)
 }
 
-// Preconditions: The combined length of all Blocks in slice <= limit. If
-// len(slice) != 0, the first Block in slice has non-zero length, and limit >
-// 0.
+// Preconditions:
+// * The combined length of all Blocks in slice <= limit.
+// * If len(slice) != 0, the first Block in slice has non-zero length and
+//   limit > 0.
 func blockSeqFromSliceLimited(slice []Block, limit uint64) BlockSeq {
 	switch len(slice) {
 	case 0:
diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD
index 29aeaab8c..e828894b0 100644
--- a/pkg/seccomp/BUILD
+++ b/pkg/seccomp/BUILD
@@ -10,6 +10,7 @@ go_binary(
         "seccomp_test_victim_amd64.go",
         "seccomp_test_victim_arm64.go",
     ],
+    nogo = False,
     deps = [":seccomp"],
 )
 
@@ -48,7 +49,7 @@ go_test(
     library = ":seccomp",
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/bpf",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go
index 55fd6967e..752e2dc32 100644
--- a/pkg/seccomp/seccomp.go
+++ b/pkg/seccomp/seccomp.go
@@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package seccomp provides basic seccomp filters for x86_64 (little endian).
+// Package seccomp provides generation of basic seccomp filters. Currently,
+// only little endian systems are supported.
 package seccomp
 
 import (
@@ -64,9 +65,9 @@ func Install(rules SyscallRules) error {
 			Rules:  rules,
 			Action: linux.SECCOMP_RET_ALLOW,
 		},
-	}, defaultAction)
+	}, defaultAction, defaultAction)
 	if log.IsLogging(log.Debug) {
-		programStr, errDecode := bpf.DecodeProgram(instrs)
+		programStr, errDecode := bpf.DecodeInstructions(instrs)
 		if errDecode != nil {
 			programStr = fmt.Sprintf("Error: %v\n%s", errDecode, programStr)
 		}
@@ -117,7 +118,7 @@ var SyscallName = func(sysno uintptr) string {
 
 // BuildProgram builds a BPF program from the given map of actions to matching
 // SyscallRules. The single generated program covers all provided RuleSets.
-func BuildProgram(rules []RuleSet, defaultAction linux.BPFAction) ([]linux.BPFInstruction, error) {
+func BuildProgram(rules []RuleSet, defaultAction, badArchAction linux.BPFAction) ([]linux.BPFInstruction, error) {
 	program := bpf.NewProgramBuilder()
 
 	// Be paranoid and check that syscall is done in the expected architecture.
@@ -128,7 +129,7 @@ func BuildProgram(rules []RuleSet, defaultAction linux.BPFAction) ([]linux.BPFIn
 	// defaultLabel is at the bottom of the program. The size of program
 	// may exceeds 255 lines, which is the limit of a condition jump.
 	program.AddJump(bpf.Jmp|bpf.Jeq|bpf.K, LINUX_AUDIT_ARCH, skipOneInst, 0)
-	program.AddDirectJumpLabel(defaultLabel)
+	program.AddStmt(bpf.Ret|bpf.K, uint32(badArchAction))
 	if err := buildIndex(rules, program); err != nil {
 		return nil, err
 	}
@@ -144,6 +145,11 @@ func BuildProgram(rules []RuleSet, defaultAction linux.BPFAction) ([]linux.BPFIn
 
 // buildIndex builds a BST to quickly search through all syscalls.
 func buildIndex(rules []RuleSet, program *bpf.ProgramBuilder) error {
+	// Do nothing if rules is empty.
+	if len(rules) == 0 {
+		return nil
+	}
+
 	// Build a list of all application system calls, across all given rule
 	// sets. We have a simple BST, but may dispatch individual matchers
 	// with different actions. The matchers are evaluated linearly.
@@ -216,42 +222,163 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action linux.BPFAc
 		labelled := false
 		for i, arg := range rule {
 			if arg != nil {
+				// Break out early if using MatchAny since no further
+				// instructions are required.
+				if _, ok := arg.(MatchAny); ok {
+					continue
+				}
+
+				// Determine the data offset for low and high bits of input.
+				dataOffsetLow := seccompDataOffsetArgLow(i)
+				dataOffsetHigh := seccompDataOffsetArgHigh(i)
+				if i == RuleIP {
+					dataOffsetLow = seccompDataOffsetIPLow
+					dataOffsetHigh = seccompDataOffsetIPHigh
+				}
+
+				// Add the conditional operation. Input values to the BPF
+				// program are 64bit values.  However, comparisons in BPF can
+				// only be done on 32bit values. This means that we need to do
+				// multiple BPF comparisons in order to do one logical 64bit
+				// comparison.
 				switch a := arg.(type) {
-				case AllowAny:
-				case AllowValue:
-					dataOffsetLow := seccompDataOffsetArgLow(i)
-					dataOffsetHigh := seccompDataOffsetArgHigh(i)
-					if i == RuleIP {
-						dataOffsetLow = seccompDataOffsetIPLow
-						dataOffsetHigh = seccompDataOffsetIPHigh
-					}
+				case EqualTo:
+					// EqualTo checks that both the higher and lower 32bits are equal.
 					high, low := uint32(a>>32), uint32(a)
-					// assert arg_low == low
+
+					// Assert that the lower 32bits are equal.
+					// arg_low == low ? continue : violation
 					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
-					// assert arg_high == high
+
+					// Assert that the lower 32bits are also equal.
+					// arg_high == high ? continue/success : violation
 					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					labelled = true
+				case NotEqual:
+					// NotEqual checks that either the higher or lower 32bits
+					// are *not* equal.
+					high, low := uint32(a>>32), uint32(a)
+					labelGood := fmt.Sprintf("ne%v", i)
+
+					// Check if the higher 32bits are (not) equal.
+					// arg_low == low ? continue : success
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+
+					// Assert that the lower 32bits are not equal (assuming
+					// higher bits are equal).
+					// arg_high == high ? violation : continue/success
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
+					p.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
+					p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+					labelled = true
 				case GreaterThan:
-					dataOffsetLow := seccompDataOffsetArgLow(i)
-					dataOffsetHigh := seccompDataOffsetArgHigh(i)
-					if i == RuleIP {
-						dataOffsetLow = seccompDataOffsetIPLow
-						dataOffsetHigh = seccompDataOffsetIPHigh
-					}
-					labelGood := fmt.Sprintf("gt%v", i)
+					// GreaterThan checks that the higher 32bits is greater
+					// *or* that the higher 32bits are equal and the lower
+					// 32bits are greater.
 					high, low := uint32(a>>32), uint32(a)
-					// assert arg_high < high
+					labelGood := fmt.Sprintf("gt%v", i)
+
+					// Assert the higher 32bits are greater than or equal.
+					// arg_high >= high ? continue : violation (arg_high < high)
 					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
-					// arg_high > high
+
+					// Assert that the lower 32bits are greater.
+					// arg_high == high ? continue : success (arg_high > high)
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
-					// arg_low < low
+					// arg_low > low ? continue/success : violation (arg_high == high and arg_low <= low)
 					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
 					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jgt|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
 					labelled = true
+				case GreaterThanOrEqual:
+					// GreaterThanOrEqual checks that the higher 32bits is
+					// greater *or* that the higher 32bits are equal and the
+					// lower 32bits are greater than or equal.
+					high, low := uint32(a>>32), uint32(a)
+					labelGood := fmt.Sprintf("ge%v", i)
+
+					// Assert the higher 32bits are greater than or equal.
+					// arg_high >= high ? continue : violation (arg_high < high)
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
+					// arg_high == high ? continue : success (arg_high > high)
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+
+					// Assert that the lower 32bits are greater (assuming the
+					// higher bits are equal).
+					// arg_low >= low ? continue/success : violation (arg_high == high and arg_low < low)
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
+					p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+					labelled = true
+				case LessThan:
+					// LessThan checks that the higher 32bits is less *or* that
+					// the higher 32bits are equal and the lower 32bits are
+					// less.
+					high, low := uint32(a>>32), uint32(a)
+					labelGood := fmt.Sprintf("lt%v", i)
+
+					// Assert the higher 32bits are less than or equal.
+					// arg_high > high ? violation : continue
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
+					p.AddJumpTrueLabel(bpf.Jmp|bpf.Jgt|bpf.K, high, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
+					// arg_high == high ? continue : success (arg_high < high)
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+
+					// Assert that the lower 32bits are less (assuming the
+					// higher bits are equal).
+					// arg_low >= low ? violation : continue
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
+					p.AddJumpTrueLabel(bpf.Jmp|bpf.Jge|bpf.K, low, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
+					p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+					labelled = true
+				case LessThanOrEqual:
+					// LessThan checks that the higher 32bits is less *or* that
+					// the higher 32bits are equal and the lower 32bits are
+					// less than or equal.
+					high, low := uint32(a>>32), uint32(a)
+					labelGood := fmt.Sprintf("le%v", i)
+
+					// Assert the higher 32bits are less than or equal.
+					// assert arg_high > high ? violation : continue
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
+					p.AddJumpTrueLabel(bpf.Jmp|bpf.Jgt|bpf.K, high, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
+					// arg_high == high ? continue : success
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+
+					// Assert the lower bits are less than or equal (assuming
+					// the higher bits are equal).
+					// arg_low > low ? violation : success
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
+					p.AddJumpTrueLabel(bpf.Jmp|bpf.Jgt|bpf.K, low, ruleViolationLabel(ruleSetIdx, sysno, ruleidx), 0)
+					p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood))
+					labelled = true
+				case maskedEqual:
+					// MaskedEqual checks that the bitwise AND of the value and
+					// mask are equal for both the higher and lower 32bits.
+					high, low := uint32(a.value>>32), uint32(a.value)
+					maskHigh, maskLow := uint32(a.mask>>32), uint32(a.mask)
+
+					// Assert that the lower 32bits are equal when masked.
+					// A <- arg_low.
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow)
+					// A <- arg_low & maskLow
+					p.AddStmt(bpf.Alu|bpf.And|bpf.K, maskLow)
+					// Assert that arg_low & maskLow == low.
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
+
+					// Assert that the higher 32bits are equal when masked.
+					// A <- arg_high
+					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh)
+					// A <- arg_high & maskHigh
+					p.AddStmt(bpf.Alu|bpf.And|bpf.K, maskHigh)
+					// Assert that arg_high & maskHigh == high.
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
+					labelled = true
 				default:
 					return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a))
 				}
diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go
index a52dc1b4e..daf165bbf 100644
--- a/pkg/seccomp/seccomp_rules.go
+++ b/pkg/seccomp/seccomp_rules.go
@@ -39,28 +39,79 @@ func seccompDataOffsetArgHigh(i int) uint32 {
 	return seccompDataOffsetArgLow(i) + 4
 }
 
-// AllowAny is marker to indicate any value will be accepted.
-type AllowAny struct{}
+// MatchAny is marker to indicate any value will be accepted.
+type MatchAny struct{}
 
-func (a AllowAny) String() (s string) {
+func (a MatchAny) String() (s string) {
 	return "*"
 }
 
-// AllowValue specifies a value that needs to be strictly matched.
-type AllowValue uintptr
+// EqualTo specifies a value that needs to be strictly matched.
+type EqualTo uintptr
+
+func (a EqualTo) String() (s string) {
+	return fmt.Sprintf("== %#x", uintptr(a))
+}
+
+// NotEqual specifies a value that is strictly not equal.
+type NotEqual uintptr
+
+func (a NotEqual) String() (s string) {
+	return fmt.Sprintf("!= %#x", uintptr(a))
+}
 
 // GreaterThan specifies a value that needs to be strictly smaller.
 type GreaterThan uintptr
 
-func (a AllowValue) String() (s string) {
-	return fmt.Sprintf("%#x ", uintptr(a))
+func (a GreaterThan) String() (s string) {
+	return fmt.Sprintf("> %#x", uintptr(a))
+}
+
+// GreaterThanOrEqual specifies a value that needs to be smaller or equal.
+type GreaterThanOrEqual uintptr
+
+func (a GreaterThanOrEqual) String() (s string) {
+	return fmt.Sprintf(">= %#x", uintptr(a))
+}
+
+// LessThan specifies a value that needs to be strictly greater.
+type LessThan uintptr
+
+func (a LessThan) String() (s string) {
+	return fmt.Sprintf("< %#x", uintptr(a))
+}
+
+// LessThanOrEqual specifies a value that needs to be greater or equal.
+type LessThanOrEqual uintptr
+
+func (a LessThanOrEqual) String() (s string) {
+	return fmt.Sprintf("<= %#x", uintptr(a))
+}
+
+type maskedEqual struct {
+	mask  uintptr
+	value uintptr
+}
+
+func (a maskedEqual) String() (s string) {
+	return fmt.Sprintf("& %#x == %#x", a.mask, a.value)
+}
+
+// MaskedEqual specifies a value that matches the input after the input is
+// masked (bitwise &) against the given mask. Can be used to verify that input
+// only includes certain approved flags.
+func MaskedEqual(mask, value uintptr) interface{} {
+	return maskedEqual{
+		mask:  mask,
+		value: value,
+	}
 }
 
 // Rule stores the allowed syscall arguments.
 //
 // For example:
 // rule := Rule {
-//       AllowValue(linux.ARCH_GET_FS | linux.ARCH_SET_FS), // arg0
+//       EqualTo(linux.ARCH_GET_FS | linux.ARCH_SET_FS), // arg0
 // }
 type Rule [7]interface{} // 6 arguments + RIP
 
@@ -89,12 +140,12 @@ func (r Rule) String() (s string) {
 //  rules := SyscallRules{
 //         syscall.SYS_FUTEX: []Rule{
 //                 {
-//                         AllowAny{},
-//                         AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+//                         MatchAny{},
+//                         EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
 //                 }, // OR
 //                 {
-//                         AllowAny{},
-//                         AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+//                         MatchAny{},
+//                         EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
 //                 },
 //         },
 //         syscall.SYS_GETPID: []Rule{},
diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go
index 5238df8bd..e1444d18b 100644
--- a/pkg/seccomp/seccomp_test.go
+++ b/pkg/seccomp/seccomp_test.go
@@ -28,17 +28,10 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/bpf"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-type seccompData struct {
-	nr                 uint32
-	arch               uint32
-	instructionPointer uint64
-	args               [6]uint64
-}
-
 // newVictim makes a victim binary.
 func newVictim() (string, error) {
 	f, err := ioutil.TempFile("", "victim")
@@ -58,9 +51,14 @@ func newVictim() (string, error) {
 	return path, nil
 }
 
-// asInput converts a seccompData to a bpf.Input.
-func (d *seccompData) asInput() bpf.Input {
-	return bpf.InputBytes{binary.Marshal(nil, binary.LittleEndian, d), binary.LittleEndian}
+// dataAsInput converts a linux.SeccompData to a bpf.Input.
+func dataAsInput(d *linux.SeccompData) bpf.Input {
+	buf := make([]byte, d.SizeBytes())
+	d.MarshalUnsafe(buf)
+	return bpf.InputBytes{
+		Data:  buf,
+		Order: usermem.ByteOrder,
+	}
 }
 
 func TestBasic(t *testing.T) {
@@ -69,18 +67,21 @@ func TestBasic(t *testing.T) {
 		desc string
 
 		// data is the input data.
-		data seccompData
+		data linux.SeccompData
 
 		// want is the expected return value of the BPF program.
 		want linux.BPFAction
 	}
 
 	for _, test := range []struct {
+		name          string
 		ruleSets      []RuleSet
 		defaultAction linux.BPFAction
+		badArchAction linux.BPFAction
 		specs         []spec
 	}{
 		{
+			name: "Single syscall",
 			ruleSets: []RuleSet{
 				{
 					Rules:  SyscallRules{1: {}},
@@ -88,26 +89,28 @@ func TestBasic(t *testing.T) {
 				},
 			},
 			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
 			specs: []spec{
 				{
-					desc: "Single syscall allowed",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH},
+					desc: "syscall allowed",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 				{
-					desc: "Single syscall disallowed",
-					data: seccompData{nr: 2, arch: LINUX_AUDIT_ARCH},
+					desc: "syscall disallowed",
+					data: linux.SeccompData{Nr: 2, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 			},
 		},
 		{
+			name: "Multiple rulesets",
 			ruleSets: []RuleSet{
 				{
 					Rules: SyscallRules{
 						1: []Rule{
 							{
-								AllowValue(0x1),
+								EqualTo(0x1),
 							},
 						},
 					},
@@ -122,30 +125,32 @@ func TestBasic(t *testing.T) {
 				},
 			},
 			defaultAction: linux.SECCOMP_RET_KILL_THREAD,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
 			specs: []spec{
 				{
-					desc: "Multiple rulesets allowed (1a)",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0x1}},
+					desc: "allowed (1a)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x1}},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 				{
-					desc: "Multiple rulesets allowed (1b)",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH},
+					desc: "allowed (1b)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 				{
-					desc: "Multiple rulesets allowed (2)",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH},
+					desc: "syscall 1 matched 2nd rule",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 				{
-					desc: "Multiple rulesets allowed (2)",
-					data: seccompData{nr: 0, arch: LINUX_AUDIT_ARCH},
+					desc: "no match",
+					data: linux.SeccompData{Nr: 0, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_KILL_THREAD,
 				},
 			},
 		},
 		{
+			name: "Multiple syscalls",
 			ruleSets: []RuleSet{
 				{
 					Rules: SyscallRules{
@@ -157,50 +162,52 @@ func TestBasic(t *testing.T) {
 				},
 			},
 			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
 			specs: []spec{
 				{
-					desc: "Multiple syscalls allowed (1)",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH},
+					desc: "allowed (1)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 				{
-					desc: "Multiple syscalls allowed (3)",
-					data: seccompData{nr: 3, arch: LINUX_AUDIT_ARCH},
+					desc: "allowed (3)",
+					data: linux.SeccompData{Nr: 3, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 				{
-					desc: "Multiple syscalls allowed (5)",
-					data: seccompData{nr: 5, arch: LINUX_AUDIT_ARCH},
+					desc: "allowed (5)",
+					data: linux.SeccompData{Nr: 5, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 				{
-					desc: "Multiple syscalls disallowed (0)",
-					data: seccompData{nr: 0, arch: LINUX_AUDIT_ARCH},
+					desc: "disallowed (0)",
+					data: linux.SeccompData{Nr: 0, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 				{
-					desc: "Multiple syscalls disallowed (2)",
-					data: seccompData{nr: 2, arch: LINUX_AUDIT_ARCH},
+					desc: "disallowed (2)",
+					data: linux.SeccompData{Nr: 2, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 				{
-					desc: "Multiple syscalls disallowed (4)",
-					data: seccompData{nr: 4, arch: LINUX_AUDIT_ARCH},
+					desc: "disallowed (4)",
+					data: linux.SeccompData{Nr: 4, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 				{
-					desc: "Multiple syscalls disallowed (6)",
-					data: seccompData{nr: 6, arch: LINUX_AUDIT_ARCH},
+					desc: "disallowed (6)",
+					data: linux.SeccompData{Nr: 6, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 				{
-					desc: "Multiple syscalls disallowed (100)",
-					data: seccompData{nr: 100, arch: LINUX_AUDIT_ARCH},
+					desc: "disallowed (100)",
+					data: linux.SeccompData{Nr: 100, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 			},
 		},
 		{
+			name: "Wrong architecture",
 			ruleSets: []RuleSet{
 				{
 					Rules: SyscallRules{
@@ -210,15 +217,17 @@ func TestBasic(t *testing.T) {
 				},
 			},
 			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
 			specs: []spec{
 				{
-					desc: "Wrong architecture",
-					data: seccompData{nr: 1, arch: 123},
-					want: linux.SECCOMP_RET_TRAP,
+					desc: "arch (123)",
+					data: linux.SeccompData{Nr: 1, Arch: 123},
+					want: linux.SECCOMP_RET_KILL_THREAD,
 				},
 			},
 		},
 		{
+			name: "Syscall disallowed",
 			ruleSets: []RuleSet{
 				{
 					Rules: SyscallRules{
@@ -228,22 +237,24 @@ func TestBasic(t *testing.T) {
 				},
 			},
 			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
 			specs: []spec{
 				{
-					desc: "Syscall disallowed, action trap",
-					data: seccompData{nr: 2, arch: LINUX_AUDIT_ARCH},
+					desc: "action trap",
+					data: linux.SeccompData{Nr: 2, Arch: LINUX_AUDIT_ARCH},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 			},
 		},
 		{
+			name: "Syscall arguments",
 			ruleSets: []RuleSet{
 				{
 					Rules: SyscallRules{
 						1: []Rule{
 							{
-								AllowAny{},
-								AllowValue(0xf),
+								MatchAny{},
+								EqualTo(0xf),
 							},
 						},
 					},
@@ -251,29 +262,31 @@ func TestBasic(t *testing.T) {
 				},
 			},
 			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
 			specs: []spec{
 				{
-					desc: "Syscall argument allowed",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0xf, 0xf}},
+					desc: "allowed",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0xf, 0xf}},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 				{
-					desc: "Syscall argument disallowed",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0xf, 0xe}},
+					desc: "disallowed",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0xf, 0xe}},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 			},
 		},
 		{
+			name: "Multiple arguments",
 			ruleSets: []RuleSet{
 				{
 					Rules: SyscallRules{
 						1: []Rule{
 							{
-								AllowValue(0xf),
+								EqualTo(0xf),
 							},
 							{
-								AllowValue(0xe),
+								EqualTo(0xe),
 							},
 						},
 					},
@@ -281,28 +294,30 @@ func TestBasic(t *testing.T) {
 				},
 			},
 			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
 			specs: []spec{
 				{
-					desc: "Syscall argument allowed, two rules",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0xf}},
+					desc: "match first rule",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0xf}},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 				{
-					desc: "Syscall argument allowed, two rules",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0xe}},
+					desc: "match 2nd rule",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0xe}},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 			},
 		},
 		{
+			name: "EqualTo",
 			ruleSets: []RuleSet{
 				{
 					Rules: SyscallRules{
 						1: []Rule{
 							{
-								AllowValue(0),
-								AllowValue(math.MaxUint64 - 1),
-								AllowValue(math.MaxUint32),
+								EqualTo(0),
+								EqualTo(math.MaxUint64 - 1),
+								EqualTo(math.MaxUint32),
 							},
 						},
 					},
@@ -310,37 +325,135 @@ func TestBasic(t *testing.T) {
 				},
 			},
 			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
 			specs: []spec{
 				{
-					desc: "64bit syscall argument allowed",
-					data: seccompData{
-						nr:   1,
-						arch: LINUX_AUDIT_ARCH,
-						args: [6]uint64{0, math.MaxUint64 - 1, math.MaxUint32},
+					desc: "argument allowed (all match)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						Args: [6]uint64{0, math.MaxUint64 - 1, math.MaxUint32},
 					},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 				{
-					desc: "64bit syscall argument disallowed",
-					data: seccompData{
-						nr:   1,
-						arch: LINUX_AUDIT_ARCH,
-						args: [6]uint64{0, math.MaxUint64, math.MaxUint32},
+					desc: "argument disallowed (one mismatch)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						Args: [6]uint64{0, math.MaxUint64, math.MaxUint32},
 					},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 				{
-					desc: "64bit syscall argument disallowed",
-					data: seccompData{
-						nr:   1,
-						arch: LINUX_AUDIT_ARCH,
-						args: [6]uint64{0, math.MaxUint64, math.MaxUint32 - 1},
+					desc: "argument disallowed (multiple mismatch)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						Args: [6]uint64{0, math.MaxUint64, math.MaxUint32 - 1},
 					},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 			},
 		},
 		{
+			name: "NotEqual",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								NotEqual(0x7aabbccdd),
+								NotEqual(math.MaxUint64 - 1),
+								NotEqual(math.MaxUint32),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "arg allowed",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						Args: [6]uint64{0, math.MaxUint64, math.MaxUint32 - 1},
+					},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg disallowed (one equal)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						Args: [6]uint64{0x7aabbccdd, math.MaxUint64, math.MaxUint32 - 1},
+					},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (all equal)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						Args: [6]uint64{0x7aabbccdd, math.MaxUint64 - 1, math.MaxUint32},
+					},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			name: "GreaterThan",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								// 4294967298
+								// Both upper 32 bits and lower 32 bits are non-zero.
+								// 00000000000000000000000000000010
+								// 00000000000000000000000000000010
+								GreaterThan(0x00000002_00000002),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "high 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000003_00000002}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "high 32bits equal, low 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000003}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "high 32bits equal, low 32bits equal",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000002}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits equal, low 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000001}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000001_00000003}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			name: "GreaterThan (multi)",
 			ruleSets: []RuleSet{
 				{
 					Rules: SyscallRules{
@@ -355,46 +468,410 @@ func TestBasic(t *testing.T) {
 				},
 			},
 			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "arg allowed",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x10, 0xffffffff}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg disallowed (first arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0xf, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (first arg smaller)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (second arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x10, 0xabcd000d}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (second arg smaller)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x10, 0xa000ffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			name: "GreaterThanOrEqual",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								// 4294967298
+								// Both upper 32 bits and lower 32 bits are non-zero.
+								// 00000000000000000000000000000010
+								// 00000000000000000000000000000010
+								GreaterThanOrEqual(0x00000002_00000002),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "high 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000003_00000002}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "high 32bits equal, low 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000003}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "high 32bits equal, low 32bits equal",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000002}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "high 32bits equal, low 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000001}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000001_00000002}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			name: "GreaterThanOrEqual (multi)",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								GreaterThanOrEqual(0xf),
+								GreaterThanOrEqual(0xabcd000d),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "arg allowed (both greater)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x10, 0xffffffff}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg allowed (first arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0xf, 0xffffffff}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg disallowed (first arg smaller)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg allowed (second arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x10, 0xabcd000d}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg disallowed (second arg smaller)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x10, 0xa000ffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (both arg smaller)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xa000ffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			name: "LessThan",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								// 4294967298
+								// Both upper 32 bits and lower 32 bits are non-zero.
+								// 00000000000000000000000000000010
+								// 00000000000000000000000000000010
+								LessThan(0x00000002_00000002),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
 			specs: []spec{
 				{
-					desc: "GreaterThan: Syscall argument allowed",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0x10, 0xffffffff}},
+					desc: "high 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000003_00000002}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits equal, low 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000003}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits equal, low 32bits equal",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000002}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits equal, low 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000001}},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 				{
-					desc: "GreaterThan: Syscall argument disallowed (equal)",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0xf, 0xffffffff}},
+					desc: "high 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000001_00000002}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+		},
+		{
+			name: "LessThan (multi)",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								LessThan(0x1),
+								LessThan(0xabcd000d),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "arg allowed",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0x0}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg disallowed (first arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x1, 0x0}},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 				{
-					desc: "Syscall argument disallowed (smaller)",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0x0, 0xffffffff}},
+					desc: "arg disallowed (first arg greater)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x2, 0x0}},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 				{
-					desc: "GreaterThan2: Syscall argument allowed",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0x10, 0xfbcd000d}},
+					desc: "arg disallowed (second arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xabcd000d}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (second arg greater)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (both arg greater)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x2, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			name: "LessThanOrEqual",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								// 4294967298
+								// Both upper 32 bits and lower 32 bits are non-zero.
+								// 00000000000000000000000000000010
+								// 00000000000000000000000000000010
+								LessThanOrEqual(0x00000002_00000002),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "high 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000003_00000002}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits equal, low 32bits greater",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000003}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "high 32bits equal, low 32bits equal",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000002}},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 				{
-					desc: "GreaterThan2: Syscall argument disallowed (equal)",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0x10, 0xabcd000d}},
+					desc: "high 32bits equal, low 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000002_00000001}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "high 32bits less",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x00000001_00000002}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+		},
+
+		{
+			name: "LessThanOrEqual (multi)",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								LessThanOrEqual(0x1),
+								LessThanOrEqual(0xabcd000d),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "arg allowed",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0x0}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg allowed (first arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x1, 0x0}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg disallowed (first arg greater)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x2, 0x0}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg allowed (second arg equal)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xabcd000d}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg disallowed (second arg greater)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x0, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (both arg greater)",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{0x2, 0xffffffff}},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			name: "MaskedEqual",
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								// x & 00000001 00000011 (0x103) == 00000000 00000001 (0x1)
+								// Input x must have lowest order bit set and
+								// must *not* have 8th or second lowest order bit set.
+								MaskedEqual(0x103, 0x1),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
+			specs: []spec{
+				{
+					desc: "arg allowed (low order mandatory bit)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						// 00000000 00000000 00000000 00000001
+						Args: [6]uint64{0x1},
+					},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg allowed (low order optional bit)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						// 00000000 00000000 00000000 00000101
+						Args: [6]uint64{0x5},
+					},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "arg disallowed (lowest order bit not set)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						// 00000000 00000000 00000000 00000010
+						Args: [6]uint64{0x2},
+					},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "arg disallowed (second lowest order bit set)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						// 00000000 00000000 00000000 00000011
+						Args: [6]uint64{0x3},
+					},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 				{
-					desc: "GreaterThan2: Syscall argument disallowed (smaller)",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{0x10, 0xa000ffff}},
+					desc: "arg disallowed (8th bit set)",
+					data: linux.SeccompData{
+						Nr:   1,
+						Arch: LINUX_AUDIT_ARCH,
+						// 00000000 00000000 00000001 00000000
+						Args: [6]uint64{0x100},
+					},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 			},
 		},
 		{
+			name: "Instruction Pointer",
 			ruleSets: []RuleSet{
 				{
 					Rules: SyscallRules{
 						1: []Rule{
 							{
-								RuleIP: AllowValue(0x7aabbccdd),
+								RuleIP: EqualTo(0x7aabbccdd),
 							},
 						},
 					},
@@ -402,40 +879,42 @@ func TestBasic(t *testing.T) {
 				},
 			},
 			defaultAction: linux.SECCOMP_RET_TRAP,
+			badArchAction: linux.SECCOMP_RET_KILL_THREAD,
 			specs: []spec{
 				{
-					desc: "IP: Syscall instruction pointer allowed",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{}, instructionPointer: 0x7aabbccdd},
+					desc: "allowed",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{}, InstructionPointer: 0x7aabbccdd},
 					want: linux.SECCOMP_RET_ALLOW,
 				},
 				{
-					desc: "IP: Syscall instruction pointer disallowed",
-					data: seccompData{nr: 1, arch: LINUX_AUDIT_ARCH, args: [6]uint64{}, instructionPointer: 0x711223344},
+					desc: "disallowed",
+					data: linux.SeccompData{Nr: 1, Arch: LINUX_AUDIT_ARCH, Args: [6]uint64{}, InstructionPointer: 0x711223344},
 					want: linux.SECCOMP_RET_TRAP,
 				},
 			},
 		},
 	} {
-		instrs, err := BuildProgram(test.ruleSets, test.defaultAction)
-		if err != nil {
-			t.Errorf("%s: buildProgram() got error: %v", test.specs[0].desc, err)
-			continue
-		}
-		p, err := bpf.Compile(instrs)
-		if err != nil {
-			t.Errorf("%s: bpf.Compile() got error: %v", test.specs[0].desc, err)
-			continue
-		}
-		for _, spec := range test.specs {
-			got, err := bpf.Exec(p, spec.data.asInput())
+		t.Run(test.name, func(t *testing.T) {
+			instrs, err := BuildProgram(test.ruleSets, test.defaultAction, test.badArchAction)
 			if err != nil {
-				t.Errorf("%s: bpf.Exec() got error: %v", spec.desc, err)
-				continue
+				t.Fatalf("BuildProgram() got error: %v", err)
 			}
-			if got != uint32(spec.want) {
-				t.Errorf("%s: bpd.Exec() = %d, want: %d", spec.desc, got, spec.want)
+			p, err := bpf.Compile(instrs)
+			if err != nil {
+				t.Fatalf("bpf.Compile() got error: %v", err)
 			}
-		}
+			for _, spec := range test.specs {
+				got, err := bpf.Exec(p, dataAsInput(&spec.data))
+				if err != nil {
+					t.Fatalf("%s: bpf.Exec() got error: %v", spec.desc, err)
+				}
+				if got != uint32(spec.want) {
+					// Include a decoded version of the program in output for debugging purposes.
+					decoded, _ := bpf.DecodeInstructions(instrs)
+					t.Fatalf("%s: got: %d, want: %d\nBPF Program\n%s", spec.desc, got, spec.want, decoded)
+				}
+			}
+		})
 	}
 }
 
@@ -457,7 +936,7 @@ func TestRandom(t *testing.T) {
 			Rules:  syscallRules,
 			Action: linux.SECCOMP_RET_ALLOW,
 		},
-	}, linux.SECCOMP_RET_TRAP)
+	}, linux.SECCOMP_RET_TRAP, linux.SECCOMP_RET_KILL_THREAD)
 	if err != nil {
 		t.Fatalf("buildProgram() got error: %v", err)
 	}
@@ -466,8 +945,8 @@ func TestRandom(t *testing.T) {
 		t.Fatalf("bpf.Compile() got error: %v", err)
 	}
 	for i := uint32(0); i < 200; i++ {
-		data := seccompData{nr: i, arch: LINUX_AUDIT_ARCH}
-		got, err := bpf.Exec(p, data.asInput())
+		data := linux.SeccompData{Nr: int32(i), Arch: LINUX_AUDIT_ARCH}
+		got, err := bpf.Exec(p, dataAsInput(&data))
 		if err != nil {
 			t.Errorf("bpf.Exec() got error: %v, for syscall %d", err, i)
 			continue
diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go
index fe157f539..7f33e0d9e 100644
--- a/pkg/seccomp/seccomp_test_victim.go
+++ b/pkg/seccomp/seccomp_test_victim.go
@@ -100,7 +100,7 @@ func main() {
 	if !die {
 		syscalls[syscall.SYS_OPENAT] = []seccomp.Rule{
 			{
-				seccomp.AllowValue(10),
+				seccomp.EqualTo(10),
 			},
 		}
 	}
diff --git a/pkg/segment/set.go b/pkg/segment/set.go
index 1a17ad9cb..fbb31dbea 100644
--- a/pkg/segment/set.go
+++ b/pkg/segment/set.go
@@ -407,7 +407,9 @@ func (s *Set) InsertWithoutMerging(gap GapIterator, r Range, val Value) Iterator
 // and returns an iterator to the inserted segment. All existing iterators
 // (including gap, but not including the returned iterator) are invalidated.
 //
-// Preconditions: r.Start >= gap.Start(); r.End <= gap.End().
+// Preconditions:
+// * r.Start >= gap.Start().
+// * r.End <= gap.End().
 func (s *Set) InsertWithoutMergingUnchecked(gap GapIterator, r Range, val Value) Iterator {
 	gap = gap.node.rebalanceBeforeInsert(gap)
 	splitMaxGap := trackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get())
@@ -1211,12 +1213,10 @@ func (seg Iterator) End() Key {
 // does not invalidate any iterators.
 //
 // Preconditions:
-//
-// - r.Length() > 0.
-//
-// - The new range must not overlap an existing one: If seg.NextSegment().Ok(),
-// then r.end <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then
-// r.start >= seg.PrevSegment().End().
+// * r.Length() > 0.
+// * The new range must not overlap an existing one:
+//   * If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start().
+//   * If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End().
 func (seg Iterator) SetRangeUnchecked(r Range) {
 	seg.node.keys[seg.index] = r
 }
@@ -1241,8 +1241,9 @@ func (seg Iterator) SetRange(r Range) {
 // SetStartUnchecked mutates the iterated segment's start. This operation does
 // not invalidate any iterators.
 //
-// Preconditions: The new start must be valid: start < seg.End(); if
-// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End().
+// Preconditions: The new start must be valid:
+// * start < seg.End()
+// * If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End().
 func (seg Iterator) SetStartUnchecked(start Key) {
 	seg.node.keys[seg.index].Start = start
 }
@@ -1264,8 +1265,9 @@ func (seg Iterator) SetStart(start Key) {
 // SetEndUnchecked mutates the iterated segment's end. This operation does not
 // invalidate any iterators.
 //
-// Preconditions: The new end must be valid: end > seg.Start(); if
-// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start().
+// Preconditions: The new end must be valid:
+// * end > seg.Start().
+// * If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start().
 func (seg Iterator) SetEndUnchecked(end Key) {
 	seg.node.keys[seg.index].End = end
 }
@@ -1695,9 +1697,11 @@ func (s *Set) ExportSortedSlices() *SegmentDataSlices {
 
 // ImportSortedSlice initializes the given set from the given slice.
 //
-// Preconditions: s must be empty. sds must represent a valid set (the segments
-// in sds must have valid lengths that do not overlap). The segments in sds
-// must be sorted in ascending key order.
+// Preconditions:
+// * s must be empty.
+// * sds must represent a valid set (the segments in sds must have valid
+//   lengths that do not overlap).
+// * The segments in sds must be sorted in ascending key order.
 func (s *Set) ImportSortedSlices(sds *SegmentDataSlices) error {
 	if !s.IsEmpty() {
 		return fmt.Errorf("cannot import into non-empty set %v", s)
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index 901e0f320..4af4d6e84 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -22,6 +22,7 @@ go_library(
         "signal_info.go",
         "signal_stack.go",
         "stack.go",
+        "stack_unsafe.go",
         "syscalls_amd64.go",
         "syscalls_arm64.go",
     ],
@@ -33,11 +34,12 @@ go_library(
         "//pkg/context",
         "//pkg/cpuid",
         "//pkg/log",
+        "//pkg/marshal",
+        "//pkg/marshal/primitive",
         "//pkg/sentry/limits",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
-        "//tools/go_marshal/marshal",
     ],
 )
 
diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go
index a903d031c..d75d665ae 100644
--- a/pkg/sentry/arch/arch.go
+++ b/pkg/sentry/arch/arch.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -72,12 +73,12 @@ type Context interface {
 	// with return values of varying sizes (for example ARCH_GETFS). This
 	// is a simple utility function to convert to the native size in these
 	// cases, and then we can CopyOut.
-	Native(val uintptr) interface{}
+	Native(val uintptr) marshal.Marshallable
 
 	// Value converts a native type back to a generic value.
 	// Once a value has been converted to native via the above call -- it
 	// can be converted back here.
-	Value(val interface{}) uintptr
+	Value(val marshal.Marshallable) uintptr
 
 	// Width returns the number of bytes for a native value.
 	Width() uint
@@ -205,7 +206,7 @@ type Context interface {
 	// equivalent of arch_ptrace():
 
 	// PtracePeekUser implements ptrace(PTRACE_PEEKUSR).
-	PtracePeekUser(addr uintptr) (interface{}, error)
+	PtracePeekUser(addr uintptr) (marshal.Marshallable, error)
 
 	// PtracePokeUser implements ptrace(PTRACE_POKEUSR).
 	PtracePokeUser(addr, data uintptr) error
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index 0f433ee79..fd73751e7 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -154,6 +154,7 @@ func (s State) Proto() *rpb.Registers {
 		Sp:     s.Regs.Sp,
 		Pc:     s.Regs.Pc,
 		Pstate: s.Regs.Pstate,
+		Tls:    s.Regs.TPIDR_EL0,
 	}
 	return &rpb.Registers{Arch: &rpb.Registers_Arm64{Arm64: regs}}
 }
@@ -232,6 +233,7 @@ func (s *State) RegisterMap() (map[string]uintptr, error) {
 		"Sp":     uintptr(s.Regs.Sp),
 		"Pc":     uintptr(s.Regs.Pc),
 		"Pstate": uintptr(s.Regs.Pstate),
+		"Tls":    uintptr(s.Regs.TPIDR_EL0),
 	}, nil
 }
 
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index 1c3e3c14c..c7d3a206d 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -23,6 +23,8 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -179,14 +181,14 @@ func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
 }
 
 // Native returns the native type for the given val.
-func (c *context64) Native(val uintptr) interface{} {
-	v := uint64(val)
+func (c *context64) Native(val uintptr) marshal.Marshallable {
+	v := primitive.Uint64(val)
 	return &v
 }
 
 // Value returns the generic val for the given native type.
-func (c *context64) Value(val interface{}) uintptr {
-	return uintptr(*val.(*uint64))
+func (c *context64) Value(val marshal.Marshallable) uintptr {
+	return uintptr(*val.(*primitive.Uint64))
 }
 
 // Width returns the byte width of this architecture.
@@ -293,7 +295,7 @@ func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr {
 const userStructSize = 928
 
 // PtracePeekUser implements Context.PtracePeekUser.
-func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
+func (c *context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) {
 	if addr&7 != 0 || addr >= userStructSize {
 		return nil, syscall.EIO
 	}
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index 550741d8c..680d23a9f 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -22,6 +22,8 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -163,14 +165,14 @@ func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
 }
 
 // Native returns the native type for the given val.
-func (c *context64) Native(val uintptr) interface{} {
-	v := uint64(val)
+func (c *context64) Native(val uintptr) marshal.Marshallable {
+	v := primitive.Uint64(val)
 	return &v
 }
 
 // Value returns the generic val for the given native type.
-func (c *context64) Value(val interface{}) uintptr {
-	return uintptr(*val.(*uint64))
+func (c *context64) Value(val marshal.Marshallable) uintptr {
+	return uintptr(*val.(*primitive.Uint64))
 }
 
 // Width returns the byte width of this architecture.
@@ -274,7 +276,7 @@ func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr {
 }
 
 // PtracePeekUser implements Context.PtracePeekUser.
-func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
+func (c *context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) {
 	// TODO(gvisor.dev/issue/1239): Full ptrace supporting for Arm64.
 	return c.Native(0), nil
 }
diff --git a/pkg/sentry/arch/registers.proto b/pkg/sentry/arch/registers.proto
index 60c027aab..2727ba08a 100644
--- a/pkg/sentry/arch/registers.proto
+++ b/pkg/sentry/arch/registers.proto
@@ -83,6 +83,7 @@ message ARM64Registers {
   uint64 sp = 32;
   uint64 pc = 33;
   uint64 pstate = 34;
+  uint64 tls = 35;
 }
 message Registers {
   oneof arch {
diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go
index 32173aa20..d3e2324a8 100644
--- a/pkg/sentry/arch/signal_act.go
+++ b/pkg/sentry/arch/signal_act.go
@@ -14,7 +14,7 @@
 
 package arch
 
-import "gvisor.dev/gvisor/tools/go_marshal/marshal"
+import "gvisor.dev/gvisor/pkg/marshal"
 
 // Special values for SignalAct.Handler.
 const (
diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go
index 6fb756f0e..72e07a988 100644
--- a/pkg/sentry/arch/signal_amd64.go
+++ b/pkg/sentry/arch/signal_amd64.go
@@ -17,17 +17,19 @@
 package arch
 
 import (
-	"encoding/binary"
 	"math"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // SignalContext64 is equivalent to struct sigcontext, the type passed as the
 // second argument to signal handlers set by signal(2).
+//
+// +marshal
 type SignalContext64 struct {
 	R8      uint64
 	R9      uint64
@@ -68,6 +70,8 @@ const (
 )
 
 // UContext64 is equivalent to ucontext_t on 64-bit x86.
+//
+// +marshal
 type UContext64 struct {
 	Flags    uint64
 	Link     uint64
@@ -172,12 +176,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
 
 	// "... the value (%rsp+8) is always a multiple of 16 (...) when
 	// control is transferred to the function entry point." - AMD64 ABI
-	ucSize := binary.Size(uc)
-	if ucSize < 0 {
-		// This can only happen if we've screwed up the definition of
-		// UContext64.
-		panic("can't get size of UContext64")
-	}
+	ucSize := uc.SizeBytes()
 	// st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128.
 	frameSize := int(st.Arch.Width()) + ucSize + 128
 	frameBottom := (sp-usermem.Addr(frameSize)) & ^usermem.Addr(15) - 8
@@ -195,18 +194,18 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
 	info.FixSignalCodeForUser()
 
 	// Set up the stack frame.
-	infoAddr, err := st.Push(info)
-	if err != nil {
+	if _, err := info.CopyOut(st, StackBottomMagic); err != nil {
 		return err
 	}
-	ucAddr, err := st.Push(uc)
-	if err != nil {
+	infoAddr := st.Bottom
+	if _, err := uc.CopyOut(st, StackBottomMagic); err != nil {
 		return err
 	}
+	ucAddr := st.Bottom
 	if act.HasRestorer() {
 		// Push the restorer return address.
 		// Note that this doesn't need to be popped.
-		if _, err := st.Push(usermem.Addr(act.Restorer)); err != nil {
+		if _, err := primitive.CopyUint64Out(st, StackBottomMagic, act.Restorer); err != nil {
 			return err
 		}
 	} else {
@@ -240,11 +239,11 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
 func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) {
 	// Copy out the stack frame.
 	var uc UContext64
-	if _, err := st.Pop(&uc); err != nil {
+	if _, err := uc.CopyIn(st, StackBottomMagic); err != nil {
 		return 0, SignalStack{}, err
 	}
 	var info SignalInfo
-	if _, err := st.Pop(&info); err != nil {
+	if _, err := info.CopyIn(st, StackBottomMagic); err != nil {
 		return 0, SignalStack{}, err
 	}
 
diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go
index 642c79dda..7fde5d34e 100644
--- a/pkg/sentry/arch/signal_arm64.go
+++ b/pkg/sentry/arch/signal_arm64.go
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build arm64
+
 package arch
 
 import (
-	"encoding/binary"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -25,6 +26,8 @@ import (
 
 // SignalContext64 is equivalent to struct sigcontext, the type passed as the
 // second argument to signal handlers set by signal(2).
+//
+// +marshal
 type SignalContext64 struct {
 	FaultAddr uint64
 	Regs      [31]uint64
@@ -36,6 +39,7 @@ type SignalContext64 struct {
 	Reserved  [3568]uint8
 }
 
+// +marshal
 type aarch64Ctx struct {
 	Magic uint32
 	Size  uint32
@@ -43,6 +47,8 @@ type aarch64Ctx struct {
 
 // FpsimdContext is equivalent to struct fpsimd_context on arm64
 // (arch/arm64/include/uapi/asm/sigcontext.h).
+//
+// +marshal
 type FpsimdContext struct {
 	Head  aarch64Ctx
 	Fpsr  uint32
@@ -51,13 +57,15 @@ type FpsimdContext struct {
 }
 
 // UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h).
+//
+// +marshal
 type UContext64 struct {
 	Flags  uint64
 	Link   uint64
 	Stack  SignalStack
 	Sigset linux.SignalSet
 	// glibc uses a 1024-bit sigset_t
-	_pad [(1024 - 64) / 8]byte
+	_pad [120]byte // (1024 - 64) / 8 = 120
 	// sigcontext must be aligned to 16-byte
 	_pad2 [8]byte
 	// last for future expansion
@@ -94,11 +102,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
 		},
 		Sigset: sigset,
 	}
-
-	ucSize := binary.Size(uc)
-	if ucSize < 0 {
-		panic("can't get size of UContext64")
-	}
+	ucSize := uc.SizeBytes()
 
 	// frameSize = ucSize + sizeof(siginfo).
 	// sizeof(siginfo) == 128.
@@ -119,14 +123,14 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
 	info.FixSignalCodeForUser()
 
 	// Set up the stack frame.
-	infoAddr, err := st.Push(info)
-	if err != nil {
+	if _, err := info.CopyOut(st, StackBottomMagic); err != nil {
 		return err
 	}
-	ucAddr, err := st.Push(uc)
-	if err != nil {
+	infoAddr := st.Bottom
+	if _, err := uc.CopyOut(st, StackBottomMagic); err != nil {
 		return err
 	}
+	ucAddr := st.Bottom
 
 	// Set up registers.
 	c.Regs.Sp = uint64(st.Bottom)
@@ -147,11 +151,11 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
 func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) {
 	// Copy out the stack frame.
 	var uc UContext64
-	if _, err := st.Pop(&uc); err != nil {
+	if _, err := uc.CopyIn(st, StackBottomMagic); err != nil {
 		return 0, SignalStack{}, err
 	}
 	var info SignalInfo
-	if _, err := st.Pop(&info); err != nil {
+	if _, err := info.CopyIn(st, StackBottomMagic); err != nil {
 		return 0, SignalStack{}, err
 	}
 
diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go
index 0fa738a1d..a1eae98f9 100644
--- a/pkg/sentry/arch/signal_stack.go
+++ b/pkg/sentry/arch/signal_stack.go
@@ -17,8 +17,8 @@
 package arch
 
 import (
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
 const (
diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go
index 1108fa0bd..5f06c751d 100644
--- a/pkg/sentry/arch/stack.go
+++ b/pkg/sentry/arch/stack.go
@@ -15,14 +15,16 @@
 package arch
 
 import (
-	"encoding/binary"
-	"fmt"
-
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// Stack is a simple wrapper around a usermem.IO and an address.
+// Stack is a simple wrapper around a usermem.IO and an address. Stack
+// implements marshal.CopyContext, and marshallable values can be pushed or
+// popped from the stack through the marshal.Marshallable interface.
+//
+// Stack is not thread-safe.
 type Stack struct {
 	// Our arch info.
 	// We use this for automatic Native conversion of usermem.Addrs during
@@ -34,105 +36,60 @@ type Stack struct {
 
 	// Our current stack bottom.
 	Bottom usermem.Addr
-}
 
-// Push pushes the given values on to the stack.
-//
-// (This method supports Addrs and treats them as native types.)
-func (s *Stack) Push(vals ...interface{}) (usermem.Addr, error) {
-	for _, v := range vals {
-
-		// We convert some types to well-known serializable quanities.
-		var norm interface{}
-
-		// For array types, we will automatically add an appropriate
-		// terminal value. This is done simply to make the interface
-		// easier to use.
-		var term interface{}
-
-		switch v.(type) {
-		case string:
-			norm = []byte(v.(string))
-			term = byte(0)
-		case []int8, []uint8:
-			norm = v
-			term = byte(0)
-		case []int16, []uint16:
-			norm = v
-			term = uint16(0)
-		case []int32, []uint32:
-			norm = v
-			term = uint32(0)
-		case []int64, []uint64:
-			norm = v
-			term = uint64(0)
-		case []usermem.Addr:
-			// Special case: simply push recursively.
-			_, err := s.Push(s.Arch.Native(uintptr(0)))
-			if err != nil {
-				return 0, err
-			}
-			varr := v.([]usermem.Addr)
-			for i := len(varr) - 1; i >= 0; i-- {
-				_, err := s.Push(varr[i])
-				if err != nil {
-					return 0, err
-				}
-			}
-			continue
-		case usermem.Addr:
-			norm = s.Arch.Native(uintptr(v.(usermem.Addr)))
-		default:
-			norm = v
-		}
+	// Scratch buffer used for marshalling to avoid having to repeatedly
+	// allocate scratch memory.
+	scratchBuf []byte
+}
 
-		if term != nil {
-			_, err := s.Push(term)
-			if err != nil {
-				return 0, err
-			}
-		}
+// scratchBufLen is the default length of Stack.scratchBuf. The
+// largest structs the stack regularly serializes are arch.SignalInfo
+// and arch.UContext64. We'll set the default size as the larger of
+// the two, arch.UContext64.
+var scratchBufLen = (*UContext64)(nil).SizeBytes()
 
-		c := binary.Size(norm)
-		if c < 0 {
-			return 0, fmt.Errorf("bad binary.Size for %T", v)
-		}
-		n, err := usermem.CopyObjectOut(context.Background(), s.IO, s.Bottom-usermem.Addr(c), norm, usermem.IOOpts{})
-		if err != nil || c != n {
-			return 0, err
-		}
+// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer.
+func (s *Stack) CopyScratchBuffer(size int) []byte {
+	if len(s.scratchBuf) < size {
+		s.scratchBuf = make([]byte, size)
+	}
+	return s.scratchBuf[:size]
+}
 
+// StackBottomMagic is the special address callers must past to all stack
+// marshalling operations to cause the src/dst address to be computed based on
+// the current end of the stack.
+const StackBottomMagic = ^usermem.Addr(0) // usermem.Addr(-1)
+
+// CopyOutBytes implements marshal.CopyContext.CopyOutBytes. CopyOutBytes
+// computes an appropriate address based on the current end of the
+// stack. Callers use the sentinel address StackBottomMagic to marshal methods
+// to indicate this.
+func (s *Stack) CopyOutBytes(sentinel usermem.Addr, b []byte) (int, error) {
+	if sentinel != StackBottomMagic {
+		panic("Attempted to copy out to stack with absolute address")
+	}
+	c := len(b)
+	n, err := s.IO.CopyOut(context.Background(), s.Bottom-usermem.Addr(c), b, usermem.IOOpts{})
+	if err == nil && n == c {
 		s.Bottom -= usermem.Addr(n)
 	}
-
-	return s.Bottom, nil
+	return n, err
 }
 
-// Pop pops the given values off the stack.
-//
-// (This method supports Addrs and treats them as native types.)
-func (s *Stack) Pop(vals ...interface{}) (usermem.Addr, error) {
-	for _, v := range vals {
-
-		vaddr, isVaddr := v.(*usermem.Addr)
-
-		var n int
-		var err error
-		if isVaddr {
-			value := s.Arch.Native(uintptr(0))
-			n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, value, usermem.IOOpts{})
-			*vaddr = usermem.Addr(s.Arch.Value(value))
-		} else {
-			n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, v, usermem.IOOpts{})
-		}
-		if err != nil {
-			return 0, err
-		}
-
+// CopyInBytes implements marshal.CopyContext.CopyInBytes. CopyInBytes computes
+// an appropriate address based on the current end of the stack. Callers must
+// use the sentinel address StackBottomMagic to marshal methods to indicate
+// this.
+func (s *Stack) CopyInBytes(sentinel usermem.Addr, b []byte) (int, error) {
+	if sentinel != StackBottomMagic {
+		panic("Attempted to copy in from stack with absolute address")
+	}
+	n, err := s.IO.CopyIn(context.Background(), s.Bottom, b, usermem.IOOpts{})
+	if err == nil {
 		s.Bottom += usermem.Addr(n)
 	}
-
-	return s.Bottom, nil
+	return n, err
 }
 
 // Align aligns the stack to the given offset.
@@ -142,6 +99,22 @@ func (s *Stack) Align(offset int) {
 	}
 }
 
+// PushNullTerminatedByteSlice writes bs to the stack, followed by an extra null
+// byte at the end. On error, the contents of the stack and the bottom cursor
+// are undefined.
+func (s *Stack) PushNullTerminatedByteSlice(bs []byte) (int, error) {
+	// Note: Stack grows up, so write the terminal null byte first.
+	nNull, err := primitive.CopyUint8Out(s, StackBottomMagic, 0)
+	if err != nil {
+		return 0, err
+	}
+	n, err := primitive.CopyByteSliceOut(s, StackBottomMagic, bs)
+	if err != nil {
+		return 0, err
+	}
+	return n + nNull, nil
+}
+
 // StackLayout describes the location of the arguments and environment on the
 // stack.
 type StackLayout struct {
@@ -177,11 +150,10 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error)
 	l.EnvvEnd = s.Bottom
 	envAddrs := make([]usermem.Addr, len(env))
 	for i := len(env) - 1; i >= 0; i-- {
-		addr, err := s.Push(env[i])
-		if err != nil {
+		if _, err := s.PushNullTerminatedByteSlice([]byte(env[i])); err != nil {
 			return StackLayout{}, err
 		}
-		envAddrs[i] = addr
+		envAddrs[i] = s.Bottom
 	}
 	l.EnvvStart = s.Bottom
 
@@ -189,11 +161,10 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error)
 	l.ArgvEnd = s.Bottom
 	argAddrs := make([]usermem.Addr, len(args))
 	for i := len(args) - 1; i >= 0; i-- {
-		addr, err := s.Push(args[i])
-		if err != nil {
+		if _, err := s.PushNullTerminatedByteSlice([]byte(args[i])); err != nil {
 			return StackLayout{}, err
 		}
-		argAddrs[i] = addr
+		argAddrs[i] = s.Bottom
 	}
 	l.ArgvStart = s.Bottom
 
@@ -222,26 +193,26 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error)
 		auxv = append(auxv, usermem.Addr(a.Key), a.Value)
 	}
 	auxv = append(auxv, usermem.Addr(0))
-	_, err := s.Push(auxv)
+	_, err := s.pushAddrSliceAndTerminator(auxv)
 	if err != nil {
 		return StackLayout{}, err
 	}
 
 	// Push environment.
-	_, err = s.Push(envAddrs)
+	_, err = s.pushAddrSliceAndTerminator(envAddrs)
 	if err != nil {
 		return StackLayout{}, err
 	}
 
 	// Push args.
-	_, err = s.Push(argAddrs)
+	_, err = s.pushAddrSliceAndTerminator(argAddrs)
 	if err != nil {
 		return StackLayout{}, err
 	}
 
 	// Push arg count.
-	_, err = s.Push(usermem.Addr(len(args)))
-	if err != nil {
+	lenP := s.Arch.Native(uintptr(len(args)))
+	if _, err = lenP.CopyOut(s, StackBottomMagic); err != nil {
 		return StackLayout{}, err
 	}
 
diff --git a/pkg/sentry/arch/stack_unsafe.go b/pkg/sentry/arch/stack_unsafe.go
new file mode 100644
index 000000000..a90d297ee
--- /dev/null
+++ b/pkg/sentry/arch/stack_unsafe.go
@@ -0,0 +1,69 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+	"reflect"
+	"runtime"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// pushAddrSliceAndTerminator copies a slices of addresses to the stack, and
+// also pushes an extra null address element at the end of the slice.
+//
+// Internally, we unsafely transmute the slice type from the arch-dependent
+// []usermem.Addr type, to a slice of fixed-sized ints so that we can pass it to
+// go-marshal.
+//
+// On error, the contents of the stack and the bottom cursor are undefined.
+func (s *Stack) pushAddrSliceAndTerminator(src []usermem.Addr) (int, error) {
+	// Note: Stack grows upwards, so push the terminator first.
+	srcHdr := (*reflect.SliceHeader)(unsafe.Pointer(&src))
+	switch s.Arch.Width() {
+	case 8:
+		nNull, err := primitive.CopyUint64Out(s, StackBottomMagic, 0)
+		if err != nil {
+			return 0, err
+		}
+		var dst []uint64
+		dstHdr := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
+		dstHdr.Data = srcHdr.Data
+		dstHdr.Len = srcHdr.Len
+		dstHdr.Cap = srcHdr.Cap
+		n, err := primitive.CopyUint64SliceOut(s, StackBottomMagic, dst)
+		// Ensures src doesn't get GCed until we're done using it through dst.
+		runtime.KeepAlive(src)
+		return n + nNull, err
+	case 4:
+		nNull, err := primitive.CopyUint32Out(s, StackBottomMagic, 0)
+		if err != nil {
+			return 0, err
+		}
+		var dst []uint32
+		dstHdr := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
+		dstHdr.Data = srcHdr.Data
+		dstHdr.Len = srcHdr.Len
+		dstHdr.Cap = srcHdr.Cap
+		n, err := primitive.CopyUint32SliceOut(s, StackBottomMagic, dst)
+		// Ensure src doesn't get GCed until we're done using it through dst.
+		runtime.KeepAlive(src)
+		return n + nNull, err
+	default:
+		panic("Unsupported arch width")
+	}
+}
diff --git a/pkg/sentry/contexttest/contexttest.go b/pkg/sentry/contexttest/contexttest.go
index 8e5658c7a..dfd195a23 100644
--- a/pkg/sentry/contexttest/contexttest.go
+++ b/pkg/sentry/contexttest/contexttest.go
@@ -144,27 +144,7 @@ func (t *TestContext) MemoryFile() *pgalloc.MemoryFile {
 // RootContext returns a Context that may be used in tests that need root
 // credentials. Uses ptrace as the platform.Platform.
 func RootContext(tb testing.TB) context.Context {
-	return WithCreds(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace()))
-}
-
-// WithCreds returns a copy of ctx carrying creds.
-func WithCreds(ctx context.Context, creds *auth.Credentials) context.Context {
-	return &authContext{ctx, creds}
-}
-
-type authContext struct {
-	context.Context
-	creds *auth.Credentials
-}
-
-// Value implements context.Context.
-func (ac *authContext) Value(key interface{}) interface{} {
-	switch key {
-	case auth.CtxCredentials:
-		return ac.creds
-	default:
-		return ac.Context.Value(key)
-	}
+	return auth.ContextWithCredentials(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace()))
 }
 
 // WithLimitSet returns a copy of ctx carrying l.
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index 2c5d14be5..deaf5fa23 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -35,7 +35,6 @@ go_library(
         "//pkg/sync",
         "//pkg/tcpip/link/sniffer",
         "//pkg/urpc",
-        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
 
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index dfa936563..1d88db12f 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -23,8 +23,8 @@ import (
 	"text/tabwriter"
 	"time"
 
-	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/sentry/fdimport"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
@@ -183,9 +183,9 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		if initArgs.MountNamespaceVFS2 == nil {
 			// Set initArgs so that 'ctx' returns the namespace.
 			//
-			// MountNamespaceVFS2 adds a reference to the namespace, which is
-			// transferred to the new process.
+			// Add a reference to the namespace, which is transferred to the new process.
 			initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
+			initArgs.MountNamespaceVFS2.IncRef()
 		}
 	} else {
 		if initArgs.MountNamespace == nil {
@@ -203,27 +203,17 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 	}
 	initArgs.Filename = resolved
 
-	fds := make([]int, len(args.FilePayload.Files))
-	for i, file := range args.FilePayload.Files {
-		if kernel.VFS2Enabled {
-			// Need to dup to remove ownership from os.File.
-			dup, err := unix.Dup(int(file.Fd()))
-			if err != nil {
-				return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err)
-			}
-			fds[i] = dup
-		} else {
-			// VFS1 dups the file on import.
-			fds[i] = int(file.Fd())
-		}
+	fds, err := fd.NewFromFiles(args.Files)
+	if err != nil {
+		return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err)
 	}
+	defer func() {
+		for _, fd := range fds {
+			_ = fd.Close()
+		}
+	}()
 	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds)
 	if err != nil {
-		if kernel.VFS2Enabled {
-			for _, fd := range fds {
-				unix.Close(fd)
-			}
-		}
 		return nil, 0, nil, nil, err
 	}
 
diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go
index 41feeffe3..d800f2c85 100644
--- a/pkg/sentry/control/state.go
+++ b/pkg/sentry/control/state.go
@@ -69,5 +69,5 @@ func (s *State) Save(o *SaveOpts, _ *struct{}) error {
 			s.Kernel.Kill(kernel.ExitStatus{})
 		},
 	}
-	return saveOpts.Save(s.Kernel, s.Watchdog)
+	return saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog)
 }
diff --git a/pkg/sentry/devices/memdev/BUILD b/pkg/sentry/devices/memdev/BUILD
index abe58f818..4c8604d58 100644
--- a/pkg/sentry/devices/memdev/BUILD
+++ b/pkg/sentry/devices/memdev/BUILD
@@ -18,9 +18,10 @@ go_library(
         "//pkg/rand",
         "//pkg/safemem",
         "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/fsimpl/tmpfs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
-        "//pkg/sentry/mm",
-        "//pkg/sentry/pgalloc",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go
index 511179e31..fece3e762 100644
--- a/pkg/sentry/devices/memdev/full.go
+++ b/pkg/sentry/devices/memdev/full.go
@@ -24,6 +24,8 @@ import (
 const fullDevMinor = 7
 
 // fullDevice implements vfs.Device for /dev/full.
+//
+// +stateify savable
 type fullDevice struct{}
 
 // Open implements vfs.Device.Open.
@@ -38,6 +40,8 @@ func (fullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
 }
 
 // fullFD implements vfs.FileDescriptionImpl for /dev/full.
+//
+// +stateify savable
 type fullFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go
index 4918dbeeb..ff5837747 100644
--- a/pkg/sentry/devices/memdev/null.go
+++ b/pkg/sentry/devices/memdev/null.go
@@ -25,6 +25,8 @@ import (
 const nullDevMinor = 3
 
 // nullDevice implements vfs.Device for /dev/null.
+//
+// +stateify savable
 type nullDevice struct{}
 
 // Open implements vfs.Device.Open.
@@ -39,6 +41,8 @@ func (nullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
 }
 
 // nullFD implements vfs.FileDescriptionImpl for /dev/null.
+//
+// +stateify savable
 type nullFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go
index 5e7fe0280..ac943e3ba 100644
--- a/pkg/sentry/devices/memdev/random.go
+++ b/pkg/sentry/devices/memdev/random.go
@@ -30,6 +30,8 @@ const (
 )
 
 // randomDevice implements vfs.Device for /dev/random and /dev/urandom.
+//
+// +stateify savable
 type randomDevice struct{}
 
 // Open implements vfs.Device.Open.
@@ -44,6 +46,8 @@ func (randomDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry,
 }
 
 // randomFD implements vfs.FileDescriptionImpl for /dev/random.
+//
+// +stateify savable
 type randomFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go
index 2e631a252..1929e41cd 100644
--- a/pkg/sentry/devices/memdev/zero.go
+++ b/pkg/sentry/devices/memdev/zero.go
@@ -16,9 +16,10 @@ package memdev
 
 import (
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/mm"
-	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -26,6 +27,8 @@ import (
 const zeroDevMinor = 5
 
 // zeroDevice implements vfs.Device for /dev/zero.
+//
+// +stateify savable
 type zeroDevice struct{}
 
 // Open implements vfs.Device.Open.
@@ -40,6 +43,8 @@ func (zeroDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
 }
 
 // zeroFD implements vfs.FileDescriptionImpl for /dev/zero.
+//
+// +stateify savable
 type zeroFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -79,11 +84,22 @@ func (fd *zeroFD) Seek(ctx context.Context, offset int64, whence int32) (int64,
 
 // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
 func (fd *zeroFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
-	m, err := mm.NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx))
+	if opts.Private || !opts.MaxPerms.Write {
+		// This mapping will never permit writing to the "underlying file" (in
+		// Linux terms, it isn't VM_SHARED), so implement it as an anonymous
+		// mapping, but back it with fd; this is what Linux does, and is
+		// actually application-visible because the resulting VMA will show up
+		// in /proc/[pid]/maps with fd.vfsfd.VirtualDentry()'s path rather than
+		// "/dev/zero (deleted)".
+		opts.Offset = 0
+		opts.MappingIdentity = &fd.vfsfd
+		opts.MappingIdentity.IncRef()
+		return nil
+	}
+	tmpfsFD, err := tmpfs.NewZeroFile(ctx, auth.CredentialsFromContext(ctx), kernel.KernelFromContext(ctx).ShmMount(), opts.Length)
 	if err != nil {
 		return err
 	}
-	opts.MappingIdentity = m
-	opts.Mappable = m
-	return nil
+	defer tmpfsFD.DecRef(ctx)
+	return tmpfsFD.ConfigureMMap(ctx, opts)
 }
diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go
index 664e54498..a287c65ca 100644
--- a/pkg/sentry/devices/ttydev/ttydev.go
+++ b/pkg/sentry/devices/ttydev/ttydev.go
@@ -30,6 +30,8 @@ const (
 )
 
 // ttyDevice implements vfs.Device for /dev/tty.
+//
+// +stateify savable
 type ttyDevice struct{}
 
 // Open implements vfs.Device.Open.
diff --git a/pkg/sentry/devices/tundev/BUILD b/pkg/sentry/devices/tundev/BUILD
index 71c59287c..14a8bf9cd 100644
--- a/pkg/sentry/devices/tundev/BUILD
+++ b/pkg/sentry/devices/tundev/BUILD
@@ -17,6 +17,7 @@ go_library(
         "//pkg/sentry/vfs",
         "//pkg/syserror",
         "//pkg/tcpip/link/tun",
+        "//pkg/tcpip/network/arp",
         "//pkg/usermem",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go
index a40625e19..ff5d49fbd 100644
--- a/pkg/sentry/devices/tundev/tundev.go
+++ b/pkg/sentry/devices/tundev/tundev.go
@@ -16,6 +16,8 @@
 package tundev
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -26,6 +28,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
+	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -36,6 +39,8 @@ const (
 )
 
 // tunDevice implements vfs.Device for /dev/net/tun.
+//
+// +stateify savable
 type tunDevice struct{}
 
 // Open implements vfs.Device.Open.
@@ -50,6 +55,8 @@ func (tunDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opt
 }
 
 // tunFD implements vfs.FileDescriptionImpl for /dev/net/tun.
+//
+// +stateify savable
 type tunFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -64,12 +71,13 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg
 	request := args[1].Uint()
 	data := args[2].Pointer()
 
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		panic("Ioctl should be called from a task context")
+	}
+
 	switch request {
 	case linux.TUNSETIFF:
-		t := kernel.TaskFromContext(ctx)
-		if t == nil {
-			panic("Ioctl should be called from a task context")
-		}
 		if !t.HasCapability(linux.CAP_NET_ADMIN) {
 			return 0, syserror.EPERM
 		}
@@ -79,13 +87,20 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg
 		}
 
 		var req linux.IFReq
-		if _, err := usermem.CopyObjectIn(ctx, uio, data, &req, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := req.CopyIn(t, data); err != nil {
 			return 0, err
 		}
 		flags := usermem.ByteOrder.Uint16(req.Data[:])
-		return 0, fd.device.SetIff(stack.Stack, req.Name(), flags)
+		created, err := fd.device.SetIff(stack.Stack, req.Name(), flags)
+		if err == nil && created {
+			// Always start with an ARP address for interfaces so they can handle ARP
+			// packets.
+			nicID := fd.device.NICID()
+			if err := stack.Stack.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+				panic(fmt.Sprintf("failed to add ARP address after creating new TUN/TAP interface with ID = %d", nicID))
+			}
+		}
+		return 0, err
 
 	case linux.TUNGETIFF:
 		var req linux.IFReq
@@ -97,9 +112,7 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg
 		flags := fd.device.Flags() | linux.IFF_NOFILTER
 		usermem.ByteOrder.PutUint16(req.Data[:], flags)
 
-		_, err := usermem.CopyObjectOut(ctx, uio, data, &req, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err := req.CopyOut(t, data)
 		return 0, err
 
 	default:
diff --git a/pkg/sentry/fdimport/BUILD b/pkg/sentry/fdimport/BUILD
index 5e41ceb4e..6b4f8b0ed 100644
--- a/pkg/sentry/fdimport/BUILD
+++ b/pkg/sentry/fdimport/BUILD
@@ -10,6 +10,7 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/context",
+        "//pkg/fd",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/host",
         "//pkg/sentry/fsimpl/host",
diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go
index 1b7cb94c0..314661475 100644
--- a/pkg/sentry/fdimport/fdimport.go
+++ b/pkg/sentry/fdimport/fdimport.go
@@ -18,6 +18,7 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
 	hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
@@ -27,8 +28,9 @@ import (
 
 // Import imports a slice of FDs into the given FDTable. If console is true,
 // sets up TTY for the first 3 FDs in the slice representing stdin, stdout,
-// stderr. Upon success, Import takes ownership of all FDs.
-func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+// stderr. Used FDs are either closed or released. It's safe for the caller to
+// close any remaining files upon return.
+func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
 	if kernel.VFS2Enabled {
 		ttyFile, err := importVFS2(ctx, fdTable, console, fds)
 		return nil, ttyFile, err
@@ -37,7 +39,7 @@ func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []in
 	return ttyFile, nil, err
 }
 
-func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, error) {
+func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, error) {
 	var ttyFile *fs.File
 	for appFD, hostFD := range fds {
 		var appFile *fs.File
@@ -46,11 +48,12 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
 			// Import the file as a host TTY file.
 			if ttyFile == nil {
 				var err error
-				appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */)
+				appFile, err = host.ImportFile(ctx, hostFD.FD(), true /* isTTY */)
 				if err != nil {
 					return nil, err
 				}
 				defer appFile.DecRef(ctx)
+				_ = hostFD.Close() // FD is dup'd i ImportFile.
 
 				// Remember this in the TTY file, as we will
 				// use it for the other stdio FDs.
@@ -65,11 +68,12 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
 		} else {
 			// Import the file as a regular host file.
 			var err error
-			appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */)
+			appFile, err = host.ImportFile(ctx, hostFD.FD(), false /* isTTY */)
 			if err != nil {
 				return nil, err
 			}
 			defer appFile.DecRef(ctx)
+			_ = hostFD.Close() // FD is dup'd i ImportFile.
 		}
 
 		// Add the file to the FD map.
@@ -84,7 +88,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []
 	return ttyFile.FileOperations.(*host.TTYFileOperations), nil
 }
 
-func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []int) (*hostvfs2.TTYFileDescription, error) {
+func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []*fd.FD) (*hostvfs2.TTYFileDescription, error) {
 	k := kernel.KernelFromContext(ctx)
 	if k == nil {
 		return nil, fmt.Errorf("cannot find kernel from context")
@@ -98,11 +102,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi
 			// Import the file as a host TTY file.
 			if ttyFile == nil {
 				var err error
-				appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, true /* isTTY */)
+				appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), true /* isTTY */)
 				if err != nil {
 					return nil, err
 				}
 				defer appFile.DecRef(ctx)
+				hostFD.Release() // FD is transfered to host FD.
 
 				// Remember this in the TTY file, as we will use it for the other stdio
 				// FDs.
@@ -115,11 +120,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi
 			}
 		} else {
 			var err error
-			appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, false /* isTTY */)
+			appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), false /* isTTY */)
 			if err != nil {
 				return nil, err
 			}
 			defer appFile.DecRef(ctx)
+			hostFD.Release() // FD is transfered to host FD.
 		}
 
 		if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index 735452b07..ff2fe6712 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -107,8 +107,7 @@ func copyUp(ctx context.Context, d *Dirent) error {
 // leave the upper filesystem filled with any number of parent directories
 // but the upper filesystem will never be in an inconsistent state.
 //
-// Preconditions:
-// - d.Inode.overlay is non-nil.
+// Preconditions: d.Inode.overlay is non-nil.
 func copyUpLockedForRename(ctx context.Context, d *Dirent) error {
 	for {
 		// Did we race with another copy up or does there
@@ -183,12 +182,12 @@ func doCopyUp(ctx context.Context, d *Dirent) error {
 // Returns a generic error on failure.
 //
 // Preconditions:
-// - parent.Inode.overlay.upper must be non-nil.
-// - next.Inode.overlay.copyMu must be locked writable.
-// - next.Inode.overlay.lower must be non-nil.
-// - next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory,
+// * parent.Inode.overlay.upper must be non-nil.
+// * next.Inode.overlay.copyMu must be locked writable.
+// * next.Inode.overlay.lower must be non-nil.
+// * next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory,
 //   or Symlink.
-// - upper filesystem must support setting file ownership and timestamps.
+// * upper filesystem must support setting file ownership and timestamps.
 func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
 	// Extract the attributes of the file we wish to copy.
 	attrs, err := next.Inode.overlay.lower.UnstableAttr(ctx)
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 9379a4d7b..6b7b451b8 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -34,6 +34,7 @@ go_library(
         "//pkg/sentry/socket/netstack",
         "//pkg/syserror",
         "//pkg/tcpip/link/tun",
+        "//pkg/tcpip/network/arp",
         "//pkg/usermem",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
index ec474e554..19ffdec47 100644
--- a/pkg/sentry/fs/dev/net_tun.go
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -15,6 +15,8 @@
 package dev
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -25,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
+	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -60,7 +63,7 @@ func newNetTunDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMod
 }
 
 // GetFile implements fs.InodeOperations.GetFile.
-func (iops *netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+func (*netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
 	return fs.NewFile(ctx, d, flags, &netTunFileOperations{}), nil
 }
 
@@ -80,21 +83,22 @@ type netTunFileOperations struct {
 var _ fs.FileOperations = (*netTunFileOperations)(nil)
 
 // Release implements fs.FileOperations.Release.
-func (fops *netTunFileOperations) Release(ctx context.Context) {
-	fops.device.Release(ctx)
+func (n *netTunFileOperations) Release(ctx context.Context) {
+	n.device.Release(ctx)
 }
 
 // Ioctl implements fs.FileOperations.Ioctl.
-func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (n *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
 	request := args[1].Uint()
 	data := args[2].Pointer()
 
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		panic("Ioctl should be called from a task context")
+	}
+
 	switch request {
 	case linux.TUNSETIFF:
-		t := kernel.TaskFromContext(ctx)
-		if t == nil {
-			panic("Ioctl should be called from a task context")
-		}
 		if !t.HasCapability(linux.CAP_NET_ADMIN) {
 			return 0, syserror.EPERM
 		}
@@ -104,27 +108,32 @@ func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io u
 		}
 
 		var req linux.IFReq
-		if _, err := usermem.CopyObjectIn(ctx, io, data, &req, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := req.CopyIn(t, data); err != nil {
 			return 0, err
 		}
 		flags := usermem.ByteOrder.Uint16(req.Data[:])
-		return 0, fops.device.SetIff(stack.Stack, req.Name(), flags)
+		created, err := n.device.SetIff(stack.Stack, req.Name(), flags)
+		if err == nil && created {
+			// Always start with an ARP address for interfaces so they can handle ARP
+			// packets.
+			nicID := n.device.NICID()
+			if err := stack.Stack.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+				panic(fmt.Sprintf("failed to add ARP address after creating new TUN/TAP interface with ID = %d", nicID))
+			}
+		}
+		return 0, err
 
 	case linux.TUNGETIFF:
 		var req linux.IFReq
 
-		copy(req.IFName[:], fops.device.Name())
+		copy(req.IFName[:], n.device.Name())
 
 		// Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when
 		// there is no sk_filter. See __tun_chr_ioctl() in net/drivers/tun.c.
-		flags := fops.device.Flags() | linux.IFF_NOFILTER
+		flags := n.device.Flags() | linux.IFF_NOFILTER
 		usermem.ByteOrder.PutUint16(req.Data[:], flags)
 
-		_, err := usermem.CopyObjectOut(ctx, io, data, &req, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err := req.CopyOut(t, data)
 		return 0, err
 
 	default:
@@ -133,41 +142,41 @@ func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io u
 }
 
 // Write implements fs.FileOperations.Write.
-func (fops *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+func (n *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
 	data := make([]byte, src.NumBytes())
 	if _, err := src.CopyIn(ctx, data); err != nil {
 		return 0, err
 	}
-	return fops.device.Write(data)
+	return n.device.Write(data)
 }
 
 // Read implements fs.FileOperations.Read.
-func (fops *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
-	data, err := fops.device.Read()
+func (n *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	data, err := n.device.Read()
 	if err != nil {
 		return 0, err
 	}
-	n, err := dst.CopyOut(ctx, data)
-	if n > 0 && n < len(data) {
+	bytesCopied, err := dst.CopyOut(ctx, data)
+	if bytesCopied > 0 && bytesCopied < len(data) {
 		// Not an error for partial copying. Packet truncated.
 		err = nil
 	}
-	return int64(n), err
+	return int64(bytesCopied), err
 }
 
 // Readiness implements watier.Waitable.Readiness.
-func (fops *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
-	return fops.device.Readiness(mask)
+func (n *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return n.device.Readiness(mask)
 }
 
 // EventRegister implements watier.Waitable.EventRegister.
-func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
-	fops.device.EventRegister(e, mask)
+func (n *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	n.device.EventRegister(e, mask)
 }
 
 // EventUnregister implements watier.Waitable.EventUnregister.
-func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) {
-	fops.device.EventUnregister(e)
+func (n *netTunFileOperations) EventUnregister(e *waiter.Entry) {
+	n.device.EventUnregister(e)
 }
 
 // isNetTunSupported returns whether /dev/net/tun device is supported for s.
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index a2f751068..00c526b03 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -413,9 +413,9 @@ func (d *Dirent) descendantOf(p *Dirent) bool {
 // Inode.Lookup, otherwise walk will keep d.mu locked.
 //
 // Preconditions:
-// - renameMu must be held for reading.
-// - d.mu must be held.
-// - name must must not contain "/"s.
+// * renameMu must be held for reading.
+// * d.mu must be held.
+// * name must must not contain "/"s.
 func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnlock bool) (*Dirent, error) {
 	if !IsDir(d.Inode.StableAttr) {
 		return nil, syscall.ENOTDIR
@@ -577,9 +577,9 @@ func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent,
 // exists returns true if name exists in relation to d.
 //
 // Preconditions:
-// - renameMu must be held for reading.
-// - d.mu must be held.
-// - name must must not contain "/"s.
+// * renameMu must be held for reading.
+// * d.mu must be held.
+// * name must must not contain "/"s.
 func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool {
 	child, err := d.walk(ctx, root, name, false /* may unlock */)
 	if err != nil {
diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go
index 305c0f840..6ec721022 100644
--- a/pkg/sentry/fs/file_operations.go
+++ b/pkg/sentry/fs/file_operations.go
@@ -159,8 +159,9 @@ type FileOperations interface {
 	// io provides access to the virtual memory space to which pointers in args
 	// refer.
 	//
-	// Preconditions: The AddressSpace (if any) that io refers to is activated.
-	// Must only be called from a task goroutine.
+	// Preconditions:
+	// * The AddressSpace (if any) that io refers to is activated.
+	// * Must only be called from a task goroutine.
 	Ioctl(ctx context.Context, file *File, io usermem.IO, args arch.SyscallArguments) (uintptr, error)
 }
 
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
index bbafebf03..1dc409d38 100644
--- a/pkg/sentry/fs/fsutil/file_range_set.go
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -70,7 +70,9 @@ func (seg FileRangeIterator) FileRange() memmap.FileRange {
 
 // FileRangeOf returns the FileRange mapped by mr.
 //
-// Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0.
+// Preconditions:
+// * seg.Range().IsSupersetOf(mr).
+// * mr.Length() != 0.
 func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRange {
 	frstart := seg.Value() + (mr.Start - seg.Start())
 	return memmap.FileRange{frstart, frstart + mr.Length()}
@@ -82,15 +84,18 @@ func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRan
 // returns a successful partial read, Fill will call it repeatedly until all
 // bytes have been read.) EOF is handled consistently with the requirements of
 // mmap(2): bytes after EOF on the same page are zeroed; pages after EOF are
-// invalid.
+// invalid. fileSize is an upper bound on the file's size; bytes after fileSize
+// will be zeroed without calling readAt.
 //
 // Fill may read offsets outside of required, but will never read offsets
 // outside of optional. It returns a non-nil error if any error occurs, even
 // if the error only affects offsets in optional, but not in required.
 //
-// Preconditions: required.Length() > 0. optional.IsSupersetOf(required).
-// required and optional must be page-aligned.
-func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error {
+// Preconditions:
+// * required.Length() > 0.
+// * optional.IsSupersetOf(required).
+// * required and optional must be page-aligned.
+func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, fileSize uint64, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error {
 	gap := frs.LowerBoundGap(required.Start)
 	for gap.Ok() && gap.Start() < required.End {
 		if gap.Range().Length() == 0 {
@@ -103,7 +108,21 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map
 		fr, err := mf.AllocateAndFill(gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
 			var done uint64
 			for !dsts.IsEmpty() {
-				n, err := readAt(ctx, dsts, gr.Start+done)
+				n, err := func() (uint64, error) {
+					off := gr.Start + done
+					if off >= fileSize {
+						return 0, io.EOF
+					}
+					if off+dsts.NumBytes() > fileSize {
+						rd := fileSize - off
+						n, err := readAt(ctx, dsts.TakeFirst64(rd), off)
+						if n == rd && err == nil {
+							return n, io.EOF
+						}
+						return n, err
+					}
+					return readAt(ctx, dsts, off)
+				}()
 				done += n
 				dsts = dsts.DropFirst64(n)
 				if err != nil {
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go
index ef0113b52..4468f5dd2 100644
--- a/pkg/sentry/fs/fsutil/host_file_mapper.go
+++ b/pkg/sentry/fs/fsutil/host_file_mapper.go
@@ -70,6 +70,13 @@ func (f *HostFileMapper) Init() {
 	f.mappings = make(map[uint64]mapping)
 }
 
+// IsInited returns true if f.Init() has been called. This is used when
+// restoring a checkpoint that contains a HostFileMapper that may or may not
+// have been initialized.
+func (f *HostFileMapper) IsInited() bool {
+	return f.refs != nil
+}
+
 // NewHostFileMapper returns an initialized HostFileMapper allocated on the
 // heap with no references or cached mappings.
 func NewHostFileMapper() *HostFileMapper {
@@ -80,7 +87,9 @@ func NewHostFileMapper() *HostFileMapper {
 
 // IncRefOn increments the reference count on all offsets in mr.
 //
-// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned.
+// Preconditions:
+// * mr.Length() != 0.
+// * mr.Start and mr.End must be page-aligned.
 func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) {
 	f.refsMu.Lock()
 	defer f.refsMu.Unlock()
@@ -97,7 +106,9 @@ func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) {
 
 // DecRefOn decrements the reference count on all offsets in mr.
 //
-// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned.
+// Preconditions:
+// * mr.Length() != 0.
+// * mr.Start and mr.End must be page-aligned.
 func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) {
 	f.refsMu.Lock()
 	defer f.refsMu.Unlock()
@@ -204,7 +215,9 @@ func (f *HostFileMapper) UnmapAll() {
 	}
 }
 
-// Preconditions: f.mapsMu must be locked. f.mappings[chunkStart] == m.
+// Preconditions:
+// * f.mapsMu must be locked.
+// * f.mappings[chunkStart] == m.
 func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) {
 	if _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m.addr, chunkSize, 0); errno != 0 {
 		// This leaks address space and is unexpected, but is otherwise
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index fe8b0b6ac..82eda3e43 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -22,7 +22,6 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -444,7 +443,7 @@ func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs.
 // time.
 //
 // Preconditions: c.attrMu is locked for writing.
-func (c *CachingInodeOperations) touchAccessTimeLocked(now time.Time) {
+func (c *CachingInodeOperations) touchAccessTimeLocked(now ktime.Time) {
 	c.attr.AccessTime = now
 	c.dirtyAttr.AccessTime = true
 }
@@ -461,7 +460,7 @@ func (c *CachingInodeOperations) TouchModificationAndStatusChangeTime(ctx contex
 // and status change times in-place to the current time.
 //
 // Preconditions: c.attrMu is locked for writing.
-func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now time.Time) {
+func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now ktime.Time) {
 	c.attr.ModificationTime = now
 	c.dirtyAttr.ModificationTime = true
 	c.attr.StatusChangeTime = now
@@ -480,7 +479,7 @@ func (c *CachingInodeOperations) TouchStatusChangeTime(ctx context.Context) {
 // in-place to the current time.
 //
 // Preconditions: c.attrMu is locked for writing.
-func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now time.Time) {
+func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now ktime.Time) {
 	c.attr.StatusChangeTime = now
 	c.dirtyAttr.StatusChangeTime = true
 }
@@ -645,7 +644,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 					End:   fs.OffsetPageEnd(int64(gapMR.End)),
 				}
 				optMR := gap.Range()
-				err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt)
+				err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), uint64(rw.c.attr.Size), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt)
 				mem.MarkEvictable(rw.c, pgalloc.EvictableRange{optMR.Start, optMR.End})
 				seg, gap = rw.c.cache.Find(uint64(rw.offset))
 				if !seg.Ok() {
@@ -672,9 +671,6 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 				// Continue.
 				seg, gap = gap.NextSegment(), FileRangeGapIterator{}
 			}
-
-		default:
-			break
 		}
 	}
 	unlock()
@@ -684,7 +680,9 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 // maybeGrowFile grows the file's size if data has been written past the old
 // size.
 //
-// Preconditions: rw.c.attrMu and rw.c.dataMu bust be locked.
+// Preconditions:
+// * rw.c.attrMu must be locked.
+// * rw.c.dataMu must be locked.
 func (rw *inodeReadWriter) maybeGrowFile() {
 	// If the write ends beyond the file's previous size, it causes the
 	// file to grow.
@@ -766,9 +764,6 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error
 
 			// Continue.
 			seg, gap = gap.NextSegment(), FileRangeGapIterator{}
-
-		default:
-			break
 		}
 	}
 	rw.maybeGrowFile()
@@ -875,7 +870,7 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option
 	}
 
 	mf := c.mfp.MemoryFile()
-	cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, c.backingFile.ReadToBlocksAt)
+	cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), uint64(c.attr.Size), mf, usage.PageCache, c.backingFile.ReadToBlocksAt)
 
 	var ts []memmap.Translation
 	var translatedEnd uint64
diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md
index 2ca84dd74..05e043583 100644
--- a/pkg/sentry/fs/g3doc/fuse.md
+++ b/pkg/sentry/fs/g3doc/fuse.md
@@ -79,7 +79,7 @@ ops can be implemented in parallel.
 -   Implement `/dev/fuse` - a character device used to establish an FD for
     communication between the sentry and the server daemon.
 
--   Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`.
+-   Implement basic FUSE ops like `FUSE_INIT`.
 
 #### Read-only mount with basic file operations
 
@@ -95,6 +95,103 @@ ops can be implemented in parallel.
 -   Implement the remaining FUSE ops and decide if we can omit rarely used
     operations like ioctl.
 
+### Design Details
+
+#### Lifecycle for a FUSE Request
+
+-   User invokes a syscall
+-   Sentry prepares corresponding request
+    -   If FUSE device is available
+        -   Write the request in binary
+    -   If FUSE device is full
+        -   Kernel task blocked until available
+-   Sentry notifies the readers of fuse device that it's ready for read
+-   FUSE daemon reads the request and processes it
+-   Sentry waits until a reply is written to the FUSE device
+    -   but returns directly for async requests
+-   FUSE daemon writes to the fuse device
+-   Sentry processes the reply
+    -   For sync requests, unblock blocked kernel task
+    -   For async requests, execute pre-specified callback if any
+-   Sentry returns the syscall to the user
+
+#### Channels and Queues for Requests in Different Stages
+
+`connection.initializedChan`
+
+-   a channel that the requests issued before connection initialization blocks
+    on.
+
+`fd.queue`
+
+-   a queue of requests that haven’t been read by the FUSE daemon yet.
+
+`fd.completions`
+
+-   a map of the requests that have been prepared but not yet received a
+    response, including the ones on the `fd.queue`.
+
+`fd.waitQueue`
+
+-   a queue of waiters that is waiting for the fuse device fd to be available,
+    such as the FUSE daemon.
+
+`fd.fullQueueCh`
+
+-   a channel that the kernel task will be blocked on when the fd is not
+    available.
+
+#### Basic I/O Implementation
+
+Currently we have implemented basic functionalities of read and write for our
+FUSE. We describe the design and ways to improve it here:
+
+##### Basic FUSE Read
+
+The vfs2 expects implementations of `vfs.FileDescriptionImpl.Read()` and
+`vfs.FileDescriptionImpl.PRead()`. When a syscall is made, it will eventually
+reach our implementation of those interface functions located at
+`pkg/sentry/fsimpl/fuse/regular_file.go` for regular files.
+
+After validation checks of the input, sentry sends `FUSE_READ` requests to the
+FUSE daemon. The FUSE daemon returns data after the `fuse_out_header` as the
+responses. For the first version, we create a copy in kernel memory of those
+data. They are represented as a byte slice in the marshalled struct. This
+happens as a common process for all the FUSE responses at this moment at
+`pkg/sentry/fsimpl/fuse/dev.go:writeLocked()`. We then directly copy from this
+intermediate buffer to the input buffer provided by the read syscall.
+
+There is an extra requirement for FUSE: When mounting the FUSE fs, the mounter
+or the FUSE daemon can specify a `max_read` or a `max_pages` parameter. They are
+the upperbound of the bytes to read in each `FUSE_READ` request. We implemented
+the code to handle the fragmented reads.
+
+To improve the performance: ideally we should have buffer cache to copy those
+data from the responses of FUSE daemon into, as is also the design of several
+other existing file system implementations for sentry, instead of a single-use
+temporary buffer. Directly mapping the memory of one process to another could
+also boost the performance, but to keep them isolated, we did not choose to do
+so.
+
+##### Basic FUSE Write
+
+The vfs2 invokes implementations of `vfs.FileDescriptionImpl.Write()` and
+`vfs.FileDescriptionImpl.PWrite()` on the regular file descriptor of FUSE when a
+user makes write(2) and pwrite(2) syscall.
+
+For valid writes, sentry sends the bytes to write after a `FUSE_WRITE` header
+(can be regarded as a request with 2 payloads) to the FUSE daemon. For the first
+version, we allocate a buffer inside kernel memory to store the bytes from the
+user, and copy directly from that buffer to the memory of FUSE daemon. This
+happens at `pkg/sentry/fsimpl/fuse/dev.go:readLocked()`
+
+The parameters `max_write` and `max_pages` restrict the number of bytes in one
+`FUSE_WRITE`. There are code handling fragmented writes in current
+implementation.
+
+To have better performance: the extra copy created to store the bytes to write
+can be replaced by the buffer cache as well.
+
 # Appendix
 
 ## FUSE Protocol
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index d41d23a43..1368014c4 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -32,6 +32,7 @@ go_library(
         "//pkg/fdnotifier",
         "//pkg/iovec",
         "//pkg/log",
+        "//pkg/marshal/primitive",
         "//pkg/refs",
         "//pkg/safemem",
         "//pkg/secio",
diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go
index 5d4f312cf..c8231e0aa 100644
--- a/pkg/sentry/fs/host/socket_unsafe.go
+++ b/pkg/sentry/fs/host/socket_unsafe.go
@@ -65,10 +65,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (
 	controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
 
 	if n > length {
-		return length, n, msg.Controllen, controlTrunc, err
+		return length, n, msg.Controllen, controlTrunc, nil
 	}
 
-	return n, n, msg.Controllen, controlTrunc, err
+	return n, n, msg.Controllen, controlTrunc, nil
 }
 
 // fdWriteVec sends from bufs to fd.
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index e29ae00f2..1183727ab 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -17,6 +17,7 @@ package host
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -53,7 +54,7 @@ type TTYFileOperations struct {
 func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File {
 	return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{
 		fileOperations: fileOperations{iops: iops},
-		termios:        linux.DefaultSlaveTermios,
+		termios:        linux.DefaultReplicaTermios,
 	})
 }
 
@@ -123,6 +124,11 @@ func (t *TTYFileOperations) Release(ctx context.Context) {
 
 // Ioctl implements fs.FileOperations.Ioctl.
 func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		return 0, syserror.ENOTTY
+	}
+
 	// Ignore arg[0].  This is the real FD:
 	fd := t.fileOperations.iops.fileState.FD()
 	ioctl := args[1].Uint64()
@@ -132,9 +138,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 		if err != nil {
 			return 0, err
 		}
-		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err = termios.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
@@ -146,9 +150,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 		}
 
 		var termios linux.Termios
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
 		err := ioctlSetTermios(fd, ioctl, &termios)
@@ -173,10 +175,8 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 
 		// Map the ProcessGroup into a ProcessGroupID in the task's PID
 		// namespace.
-		pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup))
+		_, err := pgID.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TIOCSPGRP:
@@ -184,11 +184,6 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 		// Equivalent to tcsetpgrp(fd, *argp).
 		// Set the foreground process group ID of this terminal.
 
-		task := kernel.TaskFromContext(ctx)
-		if task == nil {
-			return 0, syserror.ENOTTY
-		}
-
 		t.mu.Lock()
 		defer t.mu.Unlock()
 
@@ -208,12 +203,11 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 			return 0, syserror.ENOTTY
 		}
 
-		var pgID kernel.ProcessGroupID
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		var pgIDP primitive.Int32
+		if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
+		pgID := kernel.ProcessGroupID(pgIDP)
 
 		// pgID must be non-negative.
 		if pgID < 0 {
@@ -242,9 +236,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 		if err != nil {
 			return 0, err
 		}
-		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err = winsize.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TIOCSWINSZ:
@@ -255,9 +247,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO
 		// background ones) can set the winsize.
 
 		var winsize linux.Winsize
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
 		err := ioctlSetWinsize(fd, &winsize)
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index b79cd9877..004910453 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -270,7 +270,7 @@ func (i *Inode) GetXattr(ctx context.Context, name string, size uint64) (string,
 // SetXattr calls i.InodeOperations.SetXattr with i as the Inode.
 func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, flags uint32) error {
 	if i.overlay != nil {
-		return overlaySetxattr(ctx, i.overlay, d, name, value, flags)
+		return overlaySetXattr(ctx, i.overlay, d, name, value, flags)
 	}
 	return i.InodeOperations.SetXattr(ctx, i, name, value, flags)
 }
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index dc2e353d9..b16ab08ba 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -16,7 +16,6 @@ package fs
 
 import (
 	"fmt"
-	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -539,7 +538,7 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin
 
 	// Don't forward the value of the extended attribute if it would
 	// unexpectedly change the behavior of a wrapping overlay layer.
-	if strings.HasPrefix(XattrOverlayPrefix, name) {
+	if isXattrOverlay(name) {
 		return "", syserror.ENODATA
 	}
 
@@ -553,9 +552,9 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin
 	return s, err
 }
 
-func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error {
+func overlaySetXattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error {
 	// Don't allow changes to overlay xattrs through a setxattr syscall.
-	if strings.HasPrefix(XattrOverlayPrefix, name) {
+	if isXattrOverlay(name) {
 		return syserror.EPERM
 	}
 
@@ -578,7 +577,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st
 	for name := range names {
 		// Same as overlayGetXattr, we shouldn't forward along
 		// overlay attributes.
-		if strings.HasPrefix(XattrOverlayPrefix, name) {
+		if isXattrOverlay(name) {
 			delete(names, name)
 		}
 	}
@@ -587,7 +586,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st
 
 func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error {
 	// Don't allow changes to overlay xattrs through a removexattr syscall.
-	if strings.HasPrefix(XattrOverlayPrefix, name) {
+	if isXattrOverlay(name) {
 		return syserror.EPERM
 	}
 
diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go
index 35013a21b..01a1235b8 100644
--- a/pkg/sentry/fs/overlay.go
+++ b/pkg/sentry/fs/overlay.go
@@ -86,13 +86,12 @@ func isXattrOverlay(name string) bool {
 // NewOverlayRoot produces the root of an overlay.
 //
 // Preconditions:
-//
-// - upper and lower must be non-nil.
-// - upper must not be an overlay.
-// - lower should not expose character devices, pipes, or sockets, because
+// * upper and lower must be non-nil.
+// * upper must not be an overlay.
+// * lower should not expose character devices, pipes, or sockets, because
 //   copying up these types of files is not supported.
-// - lower must not require that file objects be revalidated.
-// - lower must not have dynamic file/directory content.
+// * lower must not require that file objects be revalidated.
+// * lower must not have dynamic file/directory content.
 func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags MountSourceFlags) (*Inode, error) {
 	if !IsDir(upper.StableAttr) {
 		return nil, fmt.Errorf("upper Inode is a %v, not a directory", upper.StableAttr.Type)
@@ -117,12 +116,11 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount
 // NewOverlayRootFile produces the root of an overlay that points to a file.
 //
 // Preconditions:
-//
-// - lower must be non-nil.
-// - lower should not expose character devices, pipes, or sockets, because
+// * lower must be non-nil.
+// * lower should not expose character devices, pipes, or sockets, because
 //   copying up these types of files is not supported. Neither it can be a dir.
-// - lower must not require that file objects be revalidated.
-// - lower must not have dynamic file/directory content.
+// * lower must not require that file objects be revalidated.
+// * lower must not have dynamic file/directory content.
 func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, flags MountSourceFlags) (*Inode, error) {
 	if !IsRegular(lower.StableAttr) {
 		return nil, fmt.Errorf("lower Inode is not a regular file")
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index 77c2c5c0e..b8b2281a8 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -50,6 +50,7 @@ go_library(
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/network/ipv4",
         "//pkg/usermem",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index 8615b60f0..e555672ad 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -54,7 +55,7 @@ type tcpMemInode struct {
 
 	// size stores the tcp buffer size during save, and sets the buffer
 	// size in netstack in restore. We must save/restore this here, since
-	// netstack itself is stateless.
+	// a netstack instance is created on restore.
 	size inet.TCPBufferSize
 
 	// mu protects against concurrent reads/writes to files based on this
@@ -258,6 +259,9 @@ func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSeque
 	if src.NumBytes() == 0 {
 		return 0, nil
 	}
+
+	// Only consider size of one memory page for input for performance reasons.
+	// We are only reading if it's zero or not anyway.
 	src = src.TakeFirst(usermem.PageSize - 1)
 
 	var v int32
@@ -383,11 +387,125 @@ func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.S
 	return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
 }
 
+// ipForwarding implements fs.InodeOperations.
+//
+// ipForwarding is used to enable/disable packet forwarding of netstack.
+//
+// +stateify savable
+type ipForwarding struct {
+	fsutil.SimpleFileInode
+
+	stack inet.Stack `state:"wait"`
+
+	// enabled stores the IPv4 forwarding state on save.
+	// We must save/restore this here, since a netstack instance
+	// is created on restore.
+	enabled *bool
+}
+
+func newIPForwardingInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
+	ipf := &ipForwarding{
+		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC),
+		stack:           s,
+	}
+	sattr := fs.StableAttr{
+		DeviceID:  device.ProcDevice.DeviceID(),
+		InodeID:   device.ProcDevice.NextIno(),
+		BlockSize: usermem.PageSize,
+		Type:      fs.SpecialFile,
+	}
+	return fs.NewInode(ctx, ipf, msrc, sattr)
+}
+
+// Truncate implements fs.InodeOperations.Truncate. Truncate is called when
+// O_TRUNC is specified for any kind of existing Dirent but is not called via
+// (f)truncate for proc files.
+func (*ipForwarding) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
+// +stateify savable
+type ipForwardingFile struct {
+	fsutil.FileGenericSeek          `state:"nosave"`
+	fsutil.FileNoIoctl              `state:"nosave"`
+	fsutil.FileNoMMap               `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoopFlush            `state:"nosave"`
+	fsutil.FileNoopFsync            `state:"nosave"`
+	fsutil.FileNoopRelease          `state:"nosave"`
+	fsutil.FileNotDirReaddir        `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+	waiter.AlwaysReady              `state:"nosave"`
+
+	ipf *ipForwarding
+
+	stack inet.Stack `state:"wait"`
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (ipf *ipForwarding) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	flags.Pread = true
+	flags.Pwrite = true
+	return fs.NewFile(ctx, dirent, flags, &ipForwardingFile{
+		stack: ipf.stack,
+		ipf:   ipf,
+	}), nil
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *ipForwardingFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	if offset != 0 {
+		return 0, io.EOF
+	}
+
+	if f.ipf.enabled == nil {
+		enabled := f.stack.Forwarding(ipv4.ProtocolNumber)
+		f.ipf.enabled = &enabled
+	}
+
+	val := "0\n"
+	if *f.ipf.enabled {
+		// Technically, this is not quite compatible with Linux. Linux
+		// stores these as an integer, so if you write "2" into
+		// ip_forward, you should get 2 back.
+		val = "1\n"
+	}
+	n, err := dst.CopyOut(ctx, []byte(val))
+	return int64(n), err
+}
+
+// Write implements fs.FileOperations.Write.
+//
+// Offset is ignored, multiple writes are not supported.
+func (f *ipForwardingFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Only consider size of one memory page for input for performance reasons.
+	// We are only reading if it's zero or not anyway.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return n, err
+	}
+	if f.ipf.enabled == nil {
+		f.ipf.enabled = new(bool)
+	}
+	*f.ipf.enabled = v != 0
+	return n, f.stack.SetForwarding(ipv4.ProtocolNumber, *f.ipf.enabled)
+}
+
 func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
 	contents := map[string]*fs.Inode{
 		// Add tcp_sack.
 		"tcp_sack": newTCPSackInode(ctx, msrc, s),
 
+		// Add ip_forward.
+		"ip_forward": newIPForwardingInode(ctx, msrc, s),
+
 		// The following files are simple stubs until they are
 		// implemented in netstack, most of these files are
 		// configuration related. We use the value closest to the
diff --git a/pkg/sentry/fs/proc/sys_net_state.go b/pkg/sentry/fs/proc/sys_net_state.go
index 6eba709c6..4cb4741af 100644
--- a/pkg/sentry/fs/proc/sys_net_state.go
+++ b/pkg/sentry/fs/proc/sys_net_state.go
@@ -14,7 +14,11 @@
 
 package proc
 
-import "fmt"
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+)
 
 // beforeSave is invoked by stateify.
 func (t *tcpMemInode) beforeSave() {
@@ -40,3 +44,12 @@ func (s *tcpSack) afterLoad() {
 		}
 	}
 }
+
+// afterLoad is invoked by stateify.
+func (ipf *ipForwarding) afterLoad() {
+	if ipf.enabled != nil {
+		if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil {
+			panic(fmt.Sprintf("failed to set IPv4 forwarding [%v]: %v", *ipf.enabled, err))
+		}
+	}
+}
diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go
index 355e83d47..6ef5738e7 100644
--- a/pkg/sentry/fs/proc/sys_net_test.go
+++ b/pkg/sentry/fs/proc/sys_net_test.go
@@ -123,3 +123,76 @@ func TestConfigureRecvBufferSize(t *testing.T) {
 		}
 	}
 }
+
+// TestIPForwarding tests the implementation of
+// /proc/sys/net/ipv4/ip_forwarding
+func TestIPForwarding(t *testing.T) {
+	ctx := context.Background()
+	s := inet.NewTestStack()
+
+	var cases = []struct {
+		comment string
+		initial bool
+		str     string
+		final   bool
+	}{
+		{
+			comment: `Forwarding is disabled; write 1 and enable forwarding`,
+			initial: false,
+			str:     "1",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is disabled; write 0 and disable forwarding`,
+			initial: false,
+			str:     "0",
+			final:   false,
+		},
+		{
+			comment: `Forwarding is enabled; write 1 and enable forwarding`,
+			initial: true,
+			str:     "1",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is enabled; write 0 and disable forwarding`,
+			initial: true,
+			str:     "0",
+			final:   false,
+		},
+		{
+			comment: `Forwarding is disabled; write 2404 and enable forwarding`,
+			initial: false,
+			str:     "2404",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is enabled; write 2404 and enable forwarding`,
+			initial: true,
+			str:     "2404",
+			final:   true,
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.comment, func(t *testing.T) {
+			s.IPForwarding = c.initial
+			ipf := &ipForwarding{stack: s}
+			file := &ipForwardingFile{
+				stack: s,
+				ipf:   ipf,
+			}
+
+			// Write the values.
+			src := usermem.BytesIOSequence([]byte(c.str))
+			if n, err := file.Write(ctx, nil, src, 0); n != int64(len(c.str)) || err != nil {
+				t.Errorf("file.Write(ctx, nil, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str))
+			}
+
+			// Read the values from the stack and check them.
+			if got, want := s.IPForwarding, c.final; got != want {
+				t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want)
+			}
+
+		})
+	}
+}
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 9cf7f2a62..22d658acf 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -84,6 +84,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bo
 		"auxv":          newAuxvec(t, msrc),
 		"cmdline":       newExecArgInode(t, msrc, cmdlineExecArg),
 		"comm":          newComm(t, msrc),
+		"cwd":           newCwd(t, msrc),
 		"environ":       newExecArgInode(t, msrc, environExecArg),
 		"exe":           newExe(t, msrc),
 		"fd":            newFdDir(t, msrc),
@@ -300,6 +301,49 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
 	return exec.PathnameWithDeleted(ctx), nil
 }
 
+// cwd is an fs.InodeOperations symlink for the /proc/PID/cwd file.
+//
+// +stateify savable
+type cwd struct {
+	ramfs.Symlink
+
+	t *kernel.Task
+}
+
+func newCwd(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	cwdSymlink := &cwd{
+		Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""),
+		t:       t,
+	}
+	return newProcInode(t, cwdSymlink, msrc, fs.Symlink, t)
+}
+
+// Readlink implements fs.InodeOperations.
+func (e *cwd) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
+	if !kernel.ContextCanTrace(ctx, e.t, false) {
+		return "", syserror.EACCES
+	}
+	if err := checkTaskState(e.t); err != nil {
+		return "", err
+	}
+	cwd := e.t.FSContext().WorkingDirectory()
+	if cwd == nil {
+		// It could have raced with process deletion.
+		return "", syserror.ESRCH
+	}
+	defer cwd.DecRef(ctx)
+
+	root := fs.RootFromContext(ctx)
+	if root == nil {
+		// It could have raced with process deletion.
+		return "", syserror.ESRCH
+	}
+	defer root.DecRef(ctx)
+
+	name, _ := cwd.FullName(root)
+	return name, nil
+}
+
 // namespaceSymlink represents a symlink in the namespacefs, such as the files
 // in /proc/<pid>/ns.
 //
@@ -604,7 +648,7 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) (
 	var vss, rss, data uint64
 	s.t.WithMuLocked(func(t *kernel.Task) {
 		if fdTable := t.FDTable(); fdTable != nil {
-			fds = fdTable.Size()
+			fds = fdTable.CurrentMaxFDs()
 		}
 		if mm := t.MemoryManager(); mm != nil {
 			vss = mm.VirtualMemorySize()
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index 1dc75291d..fc0498f17 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -613,7 +613,7 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional
 	}
 
 	mf := f.kernel.MemoryFile()
-	cerr := f.data.Fill(ctx, required, optional, mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+	cerr := f.data.Fill(ctx, required, optional, uint64(f.attr.Size), mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
 		// Newly-allocated pages are zeroed, so we don't need to do anything.
 		return dsts.NumBytes(), nil
 	})
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index b095312fe..998b697ca 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -16,6 +16,8 @@
 package tmpfs
 
 import (
+	"math"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -32,9 +34,15 @@ import (
 var fsInfo = fs.Info{
 	Type: linux.TMPFS_MAGIC,
 
+	// tmpfs currently does not support configurable size limits. In Linux,
+	// such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from
+	// statfs(2). However, many applications treat this as having a size limit
+	// of 0. To work around this, claim to have a very large but non-zero size,
+	// chosen to ensure that BlockSize * Blocks does not overflow int64 (which
+	// applications may also handle incorrectly).
 	// TODO(b/29637826): allow configuring a tmpfs size and enforce it.
-	TotalBlocks: 0,
-	FreeBlocks:  0,
+	TotalBlocks: math.MaxInt64 / usermem.PageSize,
+	FreeBlocks:  math.MaxInt64 / usermem.PageSize,
 }
 
 // rename implements fs.InodeOperations.Rename for tmpfs nodes.
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index 5cb0e0417..e6d0eb359 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -10,13 +10,14 @@ go_library(
         "line_discipline.go",
         "master.go",
         "queue.go",
-        "slave.go",
+        "replica.go",
         "terminal.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/marshal/primitive",
         "//pkg/refs",
         "//pkg/safemem",
         "//pkg/sentry/arch",
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go
index 463f6189e..c2da80bc2 100644
--- a/pkg/sentry/fs/tty/dir.go
+++ b/pkg/sentry/fs/tty/dir.go
@@ -37,14 +37,14 @@ import (
 // This indirectly manages all terminals within the mount.
 //
 // New Terminals are created by masterInodeOperations.GetFile, which registers
-// the slave Inode in the this directory for discovery via Lookup/Readdir. The
-// slave inode is unregistered when the master file is Released, as the slave
+// the replica Inode in the this directory for discovery via Lookup/Readdir. The
+// replica inode is unregistered when the master file is Released, as the replica
 // is no longer discoverable at that point.
 //
 // References on the underlying Terminal are held by masterFileOperations and
-// slaveInodeOperations.
+// replicaInodeOperations.
 //
-// masterInodeOperations and slaveInodeOperations hold a pointer to
+// masterInodeOperations and replicaInodeOperations hold a pointer to
 // dirInodeOperations, which is reference counted by the refcount their
 // corresponding Dirents hold on their parent (this directory).
 //
@@ -76,16 +76,16 @@ type dirInodeOperations struct {
 	// master is the master PTY inode.
 	master *fs.Inode
 
-	// slaves contains the slave inodes reachable from the directory.
+	// replicas contains the replica inodes reachable from the directory.
 	//
-	// A new slave is added by allocateTerminal and is removed by
+	// A new replica is added by allocateTerminal and is removed by
 	// masterFileOperations.Release.
 	//
-	// A reference is held on every slave in the map.
-	slaves map[uint32]*fs.Inode
+	// A reference is held on every replica in the map.
+	replicas map[uint32]*fs.Inode
 
 	// dentryMap is a SortedDentryMap used to implement Readdir containing
-	// the master and all entries in slaves.
+	// the master and all entries in replicas.
 	dentryMap *fs.SortedDentryMap
 
 	// next is the next pty index to use.
@@ -101,7 +101,7 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode {
 	d := &dirInodeOperations{
 		InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0555), linux.DEVPTS_SUPER_MAGIC),
 		msrc:                  m,
-		slaves:                make(map[uint32]*fs.Inode),
+		replicas:              make(map[uint32]*fs.Inode),
 		dentryMap:             fs.NewSortedDentryMap(nil),
 	}
 	// Linux devpts uses a default mode of 0000 for ptmx which can be
@@ -133,7 +133,7 @@ func (d *dirInodeOperations) Release(ctx context.Context) {
 	defer d.mu.Unlock()
 
 	d.master.DecRef(ctx)
-	if len(d.slaves) != 0 {
+	if len(d.replicas) != 0 {
 		panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d))
 	}
 }
@@ -149,14 +149,14 @@ func (d *dirInodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name str
 		return fs.NewDirent(ctx, d.master, name), nil
 	}
 
-	// Slave number?
+	// Replica number?
 	n, err := strconv.ParseUint(name, 10, 32)
 	if err != nil {
 		// Not found.
 		return nil, syserror.ENOENT
 	}
 
-	s, ok := d.slaves[uint32(n)]
+	s, ok := d.replicas[uint32(n)]
 	if !ok {
 		return nil, syserror.ENOENT
 	}
@@ -236,7 +236,7 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e
 		return nil, syserror.ENOMEM
 	}
 
-	if _, ok := d.slaves[n]; ok {
+	if _, ok := d.replicas[n]; ok {
 		panic(fmt.Sprintf("pty index collision; index %d already exists", n))
 	}
 
@@ -244,19 +244,19 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e
 	d.next++
 
 	// The reference returned by newTerminal is returned to the caller.
-	// Take another for the slave inode.
+	// Take another for the replica inode.
 	t.IncRef()
 
 	// Create a pts node. The owner is based on the context that opens
 	// ptmx.
 	creds := auth.CredentialsFromContext(ctx)
 	uid, gid := creds.EffectiveKUID, creds.EffectiveKGID
-	slave := newSlaveInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666))
+	replica := newReplicaInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666))
 
-	d.slaves[n] = slave
+	d.replicas[n] = replica
 	d.dentryMap.Add(strconv.FormatUint(uint64(n), 10), fs.DentAttr{
-		Type:    slave.StableAttr.Type,
-		InodeID: slave.StableAttr.InodeID,
+		Type:    replica.StableAttr.Type,
+		InodeID: replica.StableAttr.InodeID,
 	})
 
 	return t, nil
@@ -267,18 +267,18 @@ func (d *dirInodeOperations) masterClose(ctx context.Context, t *Terminal) {
 	d.mu.Lock()
 	defer d.mu.Unlock()
 
-	// The slave end disappears from the directory when the master end is
-	// closed, even if the slave end is open elsewhere.
+	// The replica end disappears from the directory when the master end is
+	// closed, even if the replica end is open elsewhere.
 	//
 	// N.B. since we're using a backdoor method to remove a directory entry
 	// we won't properly fire inotify events like Linux would.
-	s, ok := d.slaves[t.n]
+	s, ok := d.replicas[t.n]
 	if !ok {
 		panic(fmt.Sprintf("Terminal %+v doesn't exist in %+v?", t, d))
 	}
 
 	s.DecRef(ctx)
-	delete(d.slaves, t.n)
+	delete(d.replicas, t.n)
 	d.dentryMap.Remove(strconv.FormatUint(uint64(t.n), 10))
 }
 
diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go
index 2d4d44bf3..13f4901db 100644
--- a/pkg/sentry/fs/tty/fs.go
+++ b/pkg/sentry/fs/tty/fs.go
@@ -79,8 +79,8 @@ type superOperations struct{}
 //
 // It always returns true, forcing a Lookup for all entries.
 //
-// Slave entries are dropped from dir when their master is closed, so an
-// existing slave Dirent in the tree is not sufficient to guarantee that it
+// Replica entries are dropped from dir when their master is closed, so an
+// existing replica Dirent in the tree is not sufficient to guarantee that it
 // still exists on the filesystem.
 func (superOperations) Revalidate(context.Context, string, *fs.Inode, *fs.Inode) bool {
 	return true
diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go
index 2e9dd2d55..b34f4a0eb 100644
--- a/pkg/sentry/fs/tty/line_discipline.go
+++ b/pkg/sentry/fs/tty/line_discipline.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -43,7 +44,7 @@ const (
 )
 
 // lineDiscipline dictates how input and output are handled between the
-// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// pseudoterminal (pty) master and replica. It can be configured to alter I/O,
 // modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
 // pages are good resources for how to affect the line discipline:
 //
@@ -54,8 +55,8 @@ const (
 //
 // lineDiscipline has a simple structure but supports a multitude of options
 // (see the above man pages). It consists of two queues of bytes: one from the
-// terminal master to slave (the input queue) and one from slave to master (the
-// output queue). When bytes are written to one end of the pty, the line
+// terminal master to replica (the input queue) and one from replica to master
+// (the output queue). When bytes are written to one end of the pty, the line
 // discipline reads the bytes, modifies them or takes special action if
 // required, and enqueues them to be read by the other end of the pty:
 //
@@ -64,7 +65,7 @@ const (
 //    |   (inputQueueWrite)     +-------------+     (inputQueueRead)      |
 //    |                                                                   |
 //    |                                                                   v
-// masterFD                                                            slaveFD
+// masterFD                                                           replicaFD
 //    ^                                                                   |
 //    |                                                                   |
 //    |   output to terminal   +--------------+    output from process    |
@@ -103,8 +104,8 @@ type lineDiscipline struct {
 	// masterWaiter is used to wait on the master end of the TTY.
 	masterWaiter waiter.Queue `state:"zerovalue"`
 
-	// slaveWaiter is used to wait on the slave end of the TTY.
-	slaveWaiter waiter.Queue `state:"zerovalue"`
+	// replicaWaiter is used to wait on the replica end of the TTY.
+	replicaWaiter waiter.Queue `state:"zerovalue"`
 }
 
 func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
@@ -115,27 +116,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
 }
 
 // getTermios gets the linux.Termios for the tty.
-func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
 	l.termiosMu.RLock()
 	defer l.termiosMu.RUnlock()
 	// We must copy a Termios struct, not KernelTermios.
 	t := l.termios.ToTermios()
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := t.CopyOut(task, args[2].Pointer())
 	return 0, err
 }
 
 // setTermios sets a linux.Termios for the tty.
-func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
 	l.termiosMu.Lock()
 	defer l.termiosMu.Unlock()
 	oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
 	// We must copy a Termios struct, not KernelTermios.
 	var t linux.Termios
-	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := t.CopyIn(task, args[2].Pointer())
 	l.termios.FromTermios(t)
 
 	// If canonical mode is turned off, move bytes from inQueue's wait
@@ -146,27 +143,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc
 		l.inQueue.pushWaitBufLocked(l)
 		l.inQueue.readable = true
 		l.inQueue.mu.Unlock()
-		l.slaveWaiter.Notify(waiter.EventIn)
+		l.replicaWaiter.Notify(waiter.EventIn)
 	}
 
 	return 0, err
 }
 
-func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error {
 	l.sizeMu.Lock()
 	defer l.sizeMu.Unlock()
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := l.size.CopyOut(t, args[2].Pointer())
 	return err
 }
 
-func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error {
 	l.sizeMu.Lock()
 	defer l.sizeMu.Unlock()
-	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := l.size.CopyIn(t, args[2].Pointer())
 	return err
 }
 
@@ -176,14 +169,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask {
 	return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
 }
 
-func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+func (l *lineDiscipline) replicaReadiness() waiter.EventMask {
 	l.termiosMu.RLock()
 	defer l.termiosMu.RUnlock()
 	return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
 }
 
-func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
-	return l.inQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error {
+	return l.inQueue.readableSize(t, args)
 }
 
 func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -196,7 +189,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque
 	if n > 0 {
 		l.masterWaiter.Notify(waiter.EventOut)
 		if pushed {
-			l.slaveWaiter.Notify(waiter.EventIn)
+			l.replicaWaiter.Notify(waiter.EventIn)
 		}
 		return n, nil
 	}
@@ -211,14 +204,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
 		return 0, err
 	}
 	if n > 0 {
-		l.slaveWaiter.Notify(waiter.EventIn)
+		l.replicaWaiter.Notify(waiter.EventIn)
 		return n, nil
 	}
 	return 0, syserror.ErrWouldBlock
 }
 
-func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
-	return l.outQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error {
+	return l.outQueue.readableSize(t, args)
 }
 
 func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -229,7 +222,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ
 		return 0, err
 	}
 	if n > 0 {
-		l.slaveWaiter.Notify(waiter.EventOut)
+		l.replicaWaiter.Notify(waiter.EventOut)
 		if pushed {
 			l.masterWaiter.Notify(waiter.EventIn)
 		}
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index e00746017..b91184b1b 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -17,9 +17,11 @@ package tty
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -152,46 +154,51 @@ func (mf *masterFileOperations) Write(ctx context.Context, _ *fs.File, src userm
 
 // Ioctl implements fs.FileOperations.Ioctl.
 func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// ioctl(2) may only be called from a task goroutine.
+		return 0, syserror.ENOTTY
+	}
+
 	switch cmd := args[1].Uint(); cmd {
 	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
 		// Get the number of bytes in the output queue read buffer.
-		return 0, mf.t.ld.outputQueueReadSize(ctx, io, args)
+		return 0, mf.t.ld.outputQueueReadSize(t, args)
 	case linux.TCGETS:
 		// N.B. TCGETS on the master actually returns the configuration
-		// of the slave end.
-		return mf.t.ld.getTermios(ctx, io, args)
+		// of the replica end.
+		return mf.t.ld.getTermios(t, args)
 	case linux.TCSETS:
 		// N.B. TCSETS on the master actually affects the configuration
-		// of the slave end.
-		return mf.t.ld.setTermios(ctx, io, args)
+		// of the replica end.
+		return mf.t.ld.setTermios(t, args)
 	case linux.TCSETSW:
 		// TODO(b/29356795): This should drain the output queue first.
-		return mf.t.ld.setTermios(ctx, io, args)
+		return mf.t.ld.setTermios(t, args)
 	case linux.TIOCGPTN:
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mf.t.n), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		nP := primitive.Uint32(mf.t.n)
+		_, err := nP.CopyOut(t, args[2].Pointer())
 		return 0, err
 	case linux.TIOCSPTLCK:
 		// TODO(b/29356795): Implement pty locking. For now just pretend we do.
 		return 0, nil
 	case linux.TIOCGWINSZ:
-		return 0, mf.t.ld.windowSize(ctx, io, args)
+		return 0, mf.t.ld.windowSize(t, args)
 	case linux.TIOCSWINSZ:
-		return 0, mf.t.ld.setWindowSize(ctx, io, args)
+		return 0, mf.t.ld.setWindowSize(t, args)
 	case linux.TIOCSCTTY:
 		// Make the given terminal the controlling terminal of the
 		// calling process.
-		return 0, mf.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+		return 0, mf.t.setControllingTTY(ctx, args, true /* isMaster */)
 	case linux.TIOCNOTTY:
 		// Release this process's controlling terminal.
-		return 0, mf.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+		return 0, mf.t.releaseControllingTTY(ctx, args, true /* isMaster */)
 	case linux.TIOCGPGRP:
 		// Get the foreground process group.
-		return mf.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+		return mf.t.foregroundProcessGroup(ctx, args, true /* isMaster */)
 	case linux.TIOCSPGRP:
 		// Set the foreground process group.
-		return mf.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+		return mf.t.setForegroundProcessGroup(ctx, args, true /* isMaster */)
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go
index ceabb9b1e..79975d812 100644
--- a/pkg/sentry/fs/tty/queue.go
+++ b/pkg/sentry/fs/tty/queue.go
@@ -17,8 +17,10 @@ package tty
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -32,7 +34,7 @@ import (
 const waitBufMaxBytes = 131072
 
 // queue represents one of the input or output queues between a pty master and
-// slave. Bytes written to a queue are added to the read buffer until it is
+// replica. Bytes written to a queue are added to the read buffer until it is
 // full, at which point they are written to the wait buffer. Bytes are
 // processed (i.e. undergo termios transformations) as they are added to the
 // read buffer. The read buffer is readable when its length is nonzero and
@@ -85,17 +87,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
 }
 
 // readableSize writes the number of readable bytes to userspace.
-func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (q *queue) readableSize(t *kernel.Task, args arch.SyscallArguments) error {
 	q.mu.Lock()
 	defer q.mu.Unlock()
-	var size int32
+	size := primitive.Int32(0)
 	if q.readable {
-		size = int32(len(q.readBuf))
+		size = primitive.Int32(len(q.readBuf))
 	}
 
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := size.CopyOut(t, args[2].Pointer())
 	return err
 
 }
@@ -104,8 +104,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca
 // as whether the read caused more readable data to become available (whether
 // data was pushed from the wait buffer to the read buffer).
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
@@ -145,8 +144,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl
 
 // write writes to q from userspace.
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
@@ -188,8 +186,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip
 
 // writeBytes writes to q from b.
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) writeBytes(b []byte, l *lineDiscipline) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/replica.go
index 7c7292687..385d230fb 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/replica.go
@@ -17,9 +17,11 @@ package tty
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -27,11 +29,11 @@ import (
 
 // LINT.IfChange
 
-// slaveInodeOperations are the fs.InodeOperations for the slave end of the
+// replicaInodeOperations are the fs.InodeOperations for the replica end of the
 // Terminal (pts file).
 //
 // +stateify savable
-type slaveInodeOperations struct {
+type replicaInodeOperations struct {
 	fsutil.SimpleFileInode
 
 	// d is the containing dir.
@@ -41,13 +43,13 @@ type slaveInodeOperations struct {
 	t *Terminal
 }
 
-var _ fs.InodeOperations = (*slaveInodeOperations)(nil)
+var _ fs.InodeOperations = (*replicaInodeOperations)(nil)
 
-// newSlaveInode creates an fs.Inode for the slave end of a terminal.
+// newReplicaInode creates an fs.Inode for the replica end of a terminal.
 //
-// newSlaveInode takes ownership of t.
-func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode {
-	iops := &slaveInodeOperations{
+// newReplicaInode takes ownership of t.
+func newReplicaInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode {
+	iops := &replicaInodeOperations{
 		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, owner, p, linux.DEVPTS_SUPER_MAGIC),
 		d:               d,
 		t:               t,
@@ -64,18 +66,18 @@ func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owne
 		Type:    fs.CharacterDevice,
 		// See fs/devpts/inode.c:devpts_fill_super.
 		BlockSize:       1024,
-		DeviceFileMajor: linux.UNIX98_PTY_SLAVE_MAJOR,
+		DeviceFileMajor: linux.UNIX98_PTY_REPLICA_MAJOR,
 		DeviceFileMinor: t.n,
 	})
 }
 
 // Release implements fs.InodeOperations.Release.
-func (si *slaveInodeOperations) Release(ctx context.Context) {
+func (si *replicaInodeOperations) Release(ctx context.Context) {
 	si.t.DecRef(ctx)
 }
 
 // Truncate implements fs.InodeOperations.Truncate.
-func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+func (*replicaInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
 	return nil
 }
 
@@ -83,14 +85,15 @@ func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
 //
 // This may race with destruction of the terminal. If the terminal is gone, it
 // returns ENOENT.
-func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
-	return fs.NewFile(ctx, d, flags, &slaveFileOperations{si: si}), nil
+func (si *replicaInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return fs.NewFile(ctx, d, flags, &replicaFileOperations{si: si}), nil
 }
 
-// slaveFileOperations are the fs.FileOperations for the slave end of a terminal.
+// replicaFileOperations are the fs.FileOperations for the replica end of a
+// terminal.
 //
 // +stateify savable
-type slaveFileOperations struct {
+type replicaFileOperations struct {
 	fsutil.FilePipeSeek             `state:"nosave"`
 	fsutil.FileNotDirReaddir        `state:"nosave"`
 	fsutil.FileNoFsync              `state:"nosave"`
@@ -100,79 +103,84 @@ type slaveFileOperations struct {
 	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
 
 	// si is the inode operations.
-	si *slaveInodeOperations
+	si *replicaInodeOperations
 }
 
-var _ fs.FileOperations = (*slaveFileOperations)(nil)
+var _ fs.FileOperations = (*replicaFileOperations)(nil)
 
 // Release implements fs.FileOperations.Release.
-func (sf *slaveFileOperations) Release(context.Context) {
+func (sf *replicaFileOperations) Release(context.Context) {
 }
 
 // EventRegister implements waiter.Waitable.EventRegister.
-func (sf *slaveFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
-	sf.si.t.ld.slaveWaiter.EventRegister(e, mask)
+func (sf *replicaFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	sf.si.t.ld.replicaWaiter.EventRegister(e, mask)
 }
 
 // EventUnregister implements waiter.Waitable.EventUnregister.
-func (sf *slaveFileOperations) EventUnregister(e *waiter.Entry) {
-	sf.si.t.ld.slaveWaiter.EventUnregister(e)
+func (sf *replicaFileOperations) EventUnregister(e *waiter.Entry) {
+	sf.si.t.ld.replicaWaiter.EventUnregister(e)
 }
 
 // Readiness implements waiter.Waitable.Readiness.
-func (sf *slaveFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
-	return sf.si.t.ld.slaveReadiness()
+func (sf *replicaFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return sf.si.t.ld.replicaReadiness()
 }
 
 // Read implements fs.FileOperations.Read.
-func (sf *slaveFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
+func (sf *replicaFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
 	return sf.si.t.ld.inputQueueRead(ctx, dst)
 }
 
 // Write implements fs.FileOperations.Write.
-func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
+func (sf *replicaFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
 	return sf.si.t.ld.outputQueueWrite(ctx, src)
 }
 
 // Ioctl implements fs.FileOperations.Ioctl.
-func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (sf *replicaFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// ioctl(2) may only be called from a task goroutine.
+		return 0, syserror.ENOTTY
+	}
+
 	switch cmd := args[1].Uint(); cmd {
 	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
 		// Get the number of bytes in the input queue read buffer.
-		return 0, sf.si.t.ld.inputQueueReadSize(ctx, io, args)
+		return 0, sf.si.t.ld.inputQueueReadSize(t, args)
 	case linux.TCGETS:
-		return sf.si.t.ld.getTermios(ctx, io, args)
+		return sf.si.t.ld.getTermios(t, args)
 	case linux.TCSETS:
-		return sf.si.t.ld.setTermios(ctx, io, args)
+		return sf.si.t.ld.setTermios(t, args)
 	case linux.TCSETSW:
 		// TODO(b/29356795): This should drain the output queue first.
-		return sf.si.t.ld.setTermios(ctx, io, args)
+		return sf.si.t.ld.setTermios(t, args)
 	case linux.TIOCGPTN:
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sf.si.t.n), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		nP := primitive.Uint32(sf.si.t.n)
+		_, err := nP.CopyOut(t, args[2].Pointer())
 		return 0, err
 	case linux.TIOCGWINSZ:
-		return 0, sf.si.t.ld.windowSize(ctx, io, args)
+		return 0, sf.si.t.ld.windowSize(t, args)
 	case linux.TIOCSWINSZ:
-		return 0, sf.si.t.ld.setWindowSize(ctx, io, args)
+		return 0, sf.si.t.ld.setWindowSize(t, args)
 	case linux.TIOCSCTTY:
 		// Make the given terminal the controlling terminal of the
 		// calling process.
-		return 0, sf.si.t.setControllingTTY(ctx, io, args, false /* isMaster */)
+		return 0, sf.si.t.setControllingTTY(ctx, args, false /* isMaster */)
 	case linux.TIOCNOTTY:
 		// Release this process's controlling terminal.
-		return 0, sf.si.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
+		return 0, sf.si.t.releaseControllingTTY(ctx, args, false /* isMaster */)
 	case linux.TIOCGPGRP:
 		// Get the foreground process group.
-		return sf.si.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
+		return sf.si.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
 	case linux.TIOCSPGRP:
 		// Set the foreground process group.
-		return sf.si.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
+		return sf.si.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
 	}
 }
 
-// LINT.ThenChange(../../fsimpl/devpts/slave.go)
+// LINT.ThenChange(../../fsimpl/devpts/replica.go)
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index ddcccf4da..4f431d74d 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -17,10 +17,10 @@ package tty
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // LINT.IfChange
@@ -44,19 +44,19 @@ type Terminal struct {
 	// this terminal. This field is immutable.
 	masterKTTY *kernel.TTY
 
-	// slaveKTTY contains the controlling process of the slave end of this
+	// replicaKTTY contains the controlling process of the replica end of this
 	// terminal. This field is immutable.
-	slaveKTTY *kernel.TTY
+	replicaKTTY *kernel.TTY
 }
 
 func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal {
-	termios := linux.DefaultSlaveTermios
+	termios := linux.DefaultReplicaTermios
 	t := Terminal{
-		d:          d,
-		n:          n,
-		ld:         newLineDiscipline(termios),
-		masterKTTY: &kernel.TTY{Index: n},
-		slaveKTTY:  &kernel.TTY{Index: n},
+		d:           d,
+		n:           n,
+		ld:          newLineDiscipline(termios),
+		masterKTTY:  &kernel.TTY{Index: n},
+		replicaKTTY: &kernel.TTY{Index: n},
 	}
 	t.EnableLeakCheck("tty.Terminal")
 	return &t
@@ -64,7 +64,7 @@ func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal
 
 // setControllingTTY makes tm the controlling terminal of the calling thread
 // group.
-func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("setControllingTTY must be called from a task context")
@@ -75,7 +75,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a
 
 // releaseControllingTTY removes tm as the controlling terminal of the calling
 // thread group.
-func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("releaseControllingTTY must be called from a task context")
@@ -85,7 +85,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar
 }
 
 // foregroundProcessGroup gets the process group ID of tm's foreground process.
-func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("foregroundProcessGroup must be called from a task context")
@@ -97,24 +97,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a
 	}
 
 	// Write it out to *arg.
-	_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	retP := primitive.Int32(ret)
+	_, err = retP.CopyOut(task, args[2].Pointer())
 	return 0, err
 }
 
 // foregroundProcessGroup sets tm's foreground process.
-func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("setForegroundProcessGroup must be called from a task context")
 	}
 
 	// Read in the process group ID.
-	var pgid int32
-	if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
-		AddressSpaceActive: true,
-	}); err != nil {
+	var pgid primitive.Int32
+	if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil {
 		return 0, err
 	}
 
@@ -126,7 +123,7 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
 	if isMaster {
 		return tm.masterKTTY
 	}
-	return tm.slaveKTTY
+	return tm.replicaKTTY
 }
 
 // LINT.ThenChange(../../fsimpl/devpts/terminal.go)
diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go
index 2cbc05678..49edee83d 100644
--- a/pkg/sentry/fs/tty/tty_test.go
+++ b/pkg/sentry/fs/tty/tty_test.go
@@ -22,8 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-func TestSimpleMasterToSlave(t *testing.T) {
-	ld := newLineDiscipline(linux.DefaultSlaveTermios)
+func TestSimpleMasterToReplica(t *testing.T) {
+	ld := newLineDiscipline(linux.DefaultReplicaTermios)
 	ctx := contexttest.Context(t)
 	inBytes := []byte("hello, tty\n")
 	src := usermem.BytesIOSequence(inBytes)
diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go
index 2f5a43b84..124bc95ed 100644
--- a/pkg/sentry/fs/user/path.go
+++ b/pkg/sentry/fs/user/path.go
@@ -121,6 +121,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s
 
 func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) {
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 	for _, p := range paths {
 		if !path.IsAbs(p) {
diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go
index 936fd3932..1f8684dc6 100644
--- a/pkg/sentry/fs/user/user.go
+++ b/pkg/sentry/fs/user/user.go
@@ -105,6 +105,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.
 	const defaultHome = "/"
 
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 
 	creds := auth.CredentialsFromContext(ctx)
diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go
index 323506d33..be0900030 100644
--- a/pkg/sentry/fsbridge/vfs.go
+++ b/pkg/sentry/fsbridge/vfs.go
@@ -122,7 +122,7 @@ func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry)
 // remainingTraversals is not configurable in VFS2, all callers are using the
 // default anyways.
 func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
-	vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem()
+	vfsObj := l.root.Mount().Filesystem().VirtualFilesystem()
 	creds := auth.CredentialsFromContext(ctx)
 	path := fspath.Parse(pathname)
 	pop := &vfs.PathOperation{
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index 93512c9b6..6af3c3781 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -1,7 +1,19 @@
 load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "root_inode_refs",
+    out = "root_inode_refs.go",
+    package = "devpts",
+    prefix = "rootInode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "rootInode",
+    },
+)
+
 go_library(
     name = "devpts",
     srcs = [
@@ -9,15 +21,22 @@ go_library(
         "line_discipline.go",
         "master.go",
         "queue.go",
-        "slave.go",
+        "replica.go",
+        "root_inode_refs.go",
         "terminal.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/log",
+        "//pkg/marshal",
+        "//pkg/marshal/primitive",
+        "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fs",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/kernel",
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index 7169e91af..9185877f6 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -35,29 +35,56 @@ import (
 const Name = "devpts"
 
 // FilesystemType implements vfs.FilesystemType.
-type FilesystemType struct{}
+//
+// +stateify savable
+type FilesystemType struct {
+	initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
+	initErr  error
+
+	// fs backs all mounts of this FilesystemType. root is fs' root. fs and root
+	// are immutable.
+	fs   *vfs.Filesystem
+	root *vfs.Dentry
+}
 
 // Name implements vfs.FilesystemType.Name.
-func (FilesystemType) Name() string {
+func (*FilesystemType) Name() string {
 	return Name
 }
 
-var _ vfs.FilesystemType = (*FilesystemType)(nil)
-
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
-func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (fstype *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	// No data allowed.
 	if opts.Data != "" {
 		return nil, nil, syserror.EINVAL
 	}
 
-	fs, root, err := fstype.newFilesystem(vfsObj, creds)
-	if err != nil {
-		return nil, nil, err
+	fstype.initOnce.Do(func() {
+		fs, root, err := fstype.newFilesystem(ctx, vfsObj, creds)
+		if err != nil {
+			fstype.initErr = err
+			return
+		}
+		fstype.fs = fs.VFSFilesystem()
+		fstype.root = root.VFSDentry()
+	})
+	if fstype.initErr != nil {
+		return nil, nil, fstype.initErr
+	}
+	fstype.fs.IncRef()
+	fstype.root.IncRef()
+	return fstype.fs, fstype.root, nil
+}
+
+// Release implements vfs.FilesystemType.Release.
+func (fstype *FilesystemType) Release(ctx context.Context) {
+	if fstype.fs != nil {
+		fstype.root.DecRef(ctx)
+		fstype.fs.DecRef(ctx)
 	}
-	return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil
 }
 
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 
@@ -66,7 +93,7 @@ type filesystem struct {
 
 // newFilesystem creates a new devpts filesystem with root directory and ptmx
 // master inode. It returns the filesystem and root Dentry.
-func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) {
+func (fstype *FilesystemType) newFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) {
 	devMinor, err := vfsObj.GetAnonBlockDevMinor()
 	if err != nil {
 		return nil, nil, err
@@ -79,27 +106,29 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds
 
 	// Construct the root directory. This is always inode id 1.
 	root := &rootInode{
-		slaves: make(map[uint32]*slaveInode),
+		replicas: make(map[uint32]*replicaInode),
 	}
-	root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
+	root.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
 	root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	root.dentry.Init(root)
+	root.EnableLeakCheck()
+
+	var rootD kernfs.Dentry
+	rootD.Init(&fs.Filesystem, root)
 
 	// Construct the pts master inode and dentry. Linux always uses inode
 	// id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx.
 	master := &masterInode{
 		root: root,
 	}
-	master.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666)
-	master.dentry.Init(master)
+	master.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666)
 
 	// Add the master as a child of the root.
-	links := root.OrderedChildren.Populate(&root.dentry, map[string]*kernfs.Dentry{
-		"ptmx": &master.dentry,
+	links := root.OrderedChildren.Populate(map[string]kernfs.Inode{
+		"ptmx": master,
 	})
 	root.IncLinks(links)
 
-	return fs, &root.dentry, nil
+	return fs, &rootD, nil
 }
 
 // Release implements vfs.FilesystemImpl.Release.
@@ -109,29 +138,28 @@ func (fs *filesystem) Release(ctx context.Context) {
 }
 
 // rootInode is the root directory inode for the devpts mounts.
+//
+// +stateify savable
 type rootInode struct {
-	kernfs.AlwaysValid
+	implStatFS
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
 	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid.
 	kernfs.OrderedChildren
+	rootInodeRefs
 
 	locks vfs.FileLocks
 
-	// Keep a reference to this inode's dentry.
-	dentry kernfs.Dentry
-
 	// master is the master pty inode. Immutable.
 	master *masterInode
 
-	// root is the root directory inode for this filesystem. Immutable.
-	root *rootInode
-
 	// mu protects the fields below.
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
-	// slaves maps pty ids to slave inodes.
-	slaves map[uint32]*slaveInode
+	// replicas maps pty ids to replica inodes.
+	replicas map[uint32]*replicaInode
 
 	// nextIdx is the next pty index to use. Must be accessed atomically.
 	//
@@ -142,7 +170,7 @@ type rootInode struct {
 var _ kernfs.Inode = (*rootInode)(nil)
 
 // allocateTerminal creates a new Terminal and installs a pts node for it.
-func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) {
+func (i *rootInode) allocateTerminal(ctx context.Context, creds *auth.Credentials) (*Terminal, error) {
 	i.mu.Lock()
 	defer i.mu.Unlock()
 	if i.nextIdx == math.MaxUint32 {
@@ -151,41 +179,46 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error)
 	idx := i.nextIdx
 	i.nextIdx++
 
-	// Sanity check that slave with idx does not exist.
-	if _, ok := i.slaves[idx]; ok {
+	// Sanity check that replica with idx does not exist.
+	if _, ok := i.replicas[idx]; ok {
 		panic(fmt.Sprintf("pty index collision; index %d already exists", idx))
 	}
 
-	// Create the new terminal and slave.
+	// Create the new terminal and replica.
 	t := newTerminal(idx)
-	slave := &slaveInode{
+	replica := &replicaInode{
 		root: i,
 		t:    t,
 	}
 	// Linux always uses pty index + 3 as the inode id. See
 	// fs/devpts/inode.c:devpts_pty_new().
-	slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
-	slave.dentry.Init(slave)
-	i.slaves[idx] = slave
+	replica.InodeAttrs.Init(ctx, creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
+	i.replicas[idx] = replica
 
 	return t, nil
 }
 
 // masterClose is called when the master end of t is closed.
-func (i *rootInode) masterClose(t *Terminal) {
+func (i *rootInode) masterClose(ctx context.Context, t *Terminal) {
 	i.mu.Lock()
 	defer i.mu.Unlock()
 
-	// Sanity check that slave with idx exists.
-	if _, ok := i.slaves[t.n]; !ok {
+	// Sanity check that replica with idx exists.
+	ri, ok := i.replicas[t.n]
+	if !ok {
 		panic(fmt.Sprintf("pty with index %d does not exist", t.n))
 	}
-	delete(i.slaves, t.n)
+
+	// Drop the ref on replica inode taken during rootInode.allocateTerminal.
+	ri.DecRef(ctx)
+	delete(i.replicas, t.n)
 }
 
 // Open implements kernfs.Inode.Open.
-func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
 	if err != nil {
 		return nil, err
 	}
@@ -193,27 +226,34 @@ func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
 }
 
 // Lookup implements kernfs.Inode.Lookup.
-func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+	// Check if a static entry was looked up.
+	if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil {
+		return d, nil
+	}
+
+	// Not a static entry.
 	idx, err := strconv.ParseUint(name, 10, 32)
 	if err != nil {
 		return nil, syserror.ENOENT
 	}
 	i.mu.Lock()
 	defer i.mu.Unlock()
-	if si, ok := i.slaves[uint32(idx)]; ok {
-		si.dentry.IncRef()
-		return si.dentry.VFSDentry(), nil
+	if ri, ok := i.replicas[uint32(idx)]; ok {
+		ri.IncRef() // This ref is passed to the dentry upon creation via Init.
+		return ri, nil
 
 	}
 	return nil, syserror.ENOENT
 }
 
 // IterDirents implements kernfs.Inode.IterDirents.
-func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+func (i *rootInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
 	i.mu.Lock()
 	defer i.mu.Unlock()
-	ids := make([]int, 0, len(i.slaves))
-	for id := range i.slaves {
+	i.InodeAttrs.TouchAtime(ctx, mnt)
+	ids := make([]int, 0, len(i.replicas))
+	for id := range i.replicas {
 		ids = append(ids, int(id))
 	}
 	sort.Ints(ids)
@@ -221,7 +261,7 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
 		dirent := vfs.Dirent{
 			Name:    strconv.FormatUint(uint64(id), 10),
 			Type:    linux.DT_CHR,
-			Ino:     i.slaves[uint32(id)].InodeAttrs.Ino(),
+			Ino:     i.replicas[uint32(id)].InodeAttrs.Ino(),
 			NextOff: offset + 1,
 		}
 		if err := cb.Handle(dirent); err != nil {
@@ -231,3 +271,16 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
 	}
 	return offset, nil
 }
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *rootInode) DecRef(ctx context.Context) {
+	i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
+// +stateify savable
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.DEVPTS_SUPER_MAGIC), nil
+}
diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go
index b7c149047..448390cfe 100644
--- a/pkg/sentry/fsimpl/devpts/devpts_test.go
+++ b/pkg/sentry/fsimpl/devpts/devpts_test.go
@@ -22,8 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-func TestSimpleMasterToSlave(t *testing.T) {
-	ld := newLineDiscipline(linux.DefaultSlaveTermios)
+func TestSimpleMasterToReplica(t *testing.T) {
+	ld := newLineDiscipline(linux.DefaultReplicaTermios)
 	ctx := contexttest.Context(t)
 	inBytes := []byte("hello, tty\n")
 	src := usermem.BytesIOSequence(inBytes)
diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go
index f7bc325d1..ae95fdd08 100644
--- a/pkg/sentry/fsimpl/devpts/line_discipline.go
+++ b/pkg/sentry/fsimpl/devpts/line_discipline.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -41,7 +42,7 @@ const (
 )
 
 // lineDiscipline dictates how input and output are handled between the
-// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// pseudoterminal (pty) master and replica. It can be configured to alter I/O,
 // modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
 // pages are good resources for how to affect the line discipline:
 //
@@ -52,8 +53,8 @@ const (
 //
 // lineDiscipline has a simple structure but supports a multitude of options
 // (see the above man pages). It consists of two queues of bytes: one from the
-// terminal master to slave (the input queue) and one from slave to master (the
-// output queue). When bytes are written to one end of the pty, the line
+// terminal master to replica (the input queue) and one from replica to master
+// (the output queue). When bytes are written to one end of the pty, the line
 // discipline reads the bytes, modifies them or takes special action if
 // required, and enqueues them to be read by the other end of the pty:
 //
@@ -62,7 +63,7 @@ const (
 //    |   (inputQueueWrite)     +-------------+     (inputQueueRead)      |
 //    |                                                                   |
 //    |                                                                   v
-// masterFD                                                            slaveFD
+// masterFD                                                           replicaFD
 //    ^                                                                   |
 //    |                                                                   |
 //    |   output to terminal   +--------------+    output from process    |
@@ -99,10 +100,10 @@ type lineDiscipline struct {
 	column int
 
 	// masterWaiter is used to wait on the master end of the TTY.
-	masterWaiter waiter.Queue `state:"zerovalue"`
+	masterWaiter waiter.Queue
 
-	// slaveWaiter is used to wait on the slave end of the TTY.
-	slaveWaiter waiter.Queue `state:"zerovalue"`
+	// replicaWaiter is used to wait on the replica end of the TTY.
+	replicaWaiter waiter.Queue
 }
 
 func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
@@ -113,27 +114,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
 }
 
 // getTermios gets the linux.Termios for the tty.
-func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
 	l.termiosMu.RLock()
 	defer l.termiosMu.RUnlock()
 	// We must copy a Termios struct, not KernelTermios.
 	t := l.termios.ToTermios()
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := t.CopyOut(task, args[2].Pointer())
 	return 0, err
 }
 
 // setTermios sets a linux.Termios for the tty.
-func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
 	l.termiosMu.Lock()
 	defer l.termiosMu.Unlock()
 	oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
 	// We must copy a Termios struct, not KernelTermios.
 	var t linux.Termios
-	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := t.CopyIn(task, args[2].Pointer())
 	l.termios.FromTermios(t)
 
 	// If canonical mode is turned off, move bytes from inQueue's wait
@@ -144,27 +141,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc
 		l.inQueue.pushWaitBufLocked(l)
 		l.inQueue.readable = true
 		l.inQueue.mu.Unlock()
-		l.slaveWaiter.Notify(waiter.EventIn)
+		l.replicaWaiter.Notify(waiter.EventIn)
 	}
 
 	return 0, err
 }
 
-func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error {
 	l.sizeMu.Lock()
 	defer l.sizeMu.Unlock()
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := l.size.CopyOut(t, args[2].Pointer())
 	return err
 }
 
-func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error {
 	l.sizeMu.Lock()
 	defer l.sizeMu.Unlock()
-	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := l.size.CopyIn(t, args[2].Pointer())
 	return err
 }
 
@@ -174,14 +167,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask {
 	return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
 }
 
-func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+func (l *lineDiscipline) replicaReadiness() waiter.EventMask {
 	l.termiosMu.RLock()
 	defer l.termiosMu.RUnlock()
 	return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
 }
 
-func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
-	return l.inQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
+	return l.inQueue.readableSize(t, io, args)
 }
 
 func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -194,7 +187,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque
 	if n > 0 {
 		l.masterWaiter.Notify(waiter.EventOut)
 		if pushed {
-			l.slaveWaiter.Notify(waiter.EventIn)
+			l.replicaWaiter.Notify(waiter.EventIn)
 		}
 		return n, nil
 	}
@@ -209,14 +202,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
 		return 0, err
 	}
 	if n > 0 {
-		l.slaveWaiter.Notify(waiter.EventIn)
+		l.replicaWaiter.Notify(waiter.EventIn)
 		return n, nil
 	}
 	return 0, syserror.ErrWouldBlock
 }
 
-func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
-	return l.outQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
+	return l.outQueue.readableSize(t, io, args)
 }
 
 func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -227,7 +220,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ
 		return 0, err
 	}
 	if n > 0 {
-		l.slaveWaiter.Notify(waiter.EventOut)
+		l.replicaWaiter.Notify(waiter.EventOut)
 		if pushed {
 			l.masterWaiter.Notify(waiter.EventIn)
 		}
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 3bb397f71..e91fa26a4 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -17,9 +17,11 @@ package devpts
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -29,7 +31,10 @@ import (
 )
 
 // masterInode is the inode for the master end of the Terminal.
+//
+// +stateify savable
 type masterInode struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeNotDirectory
@@ -37,9 +42,6 @@ type masterInode struct {
 
 	locks vfs.FileLocks
 
-	// Keep a reference to this inode's dentry.
-	dentry kernfs.Dentry
-
 	// root is the devpts root inode.
 	root *rootInode
 }
@@ -47,20 +49,18 @@ type masterInode struct {
 var _ kernfs.Inode = (*masterInode)(nil)
 
 // Open implements kernfs.Inode.Open.
-func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	t, err := mi.root.allocateTerminal(rp.Credentials())
+func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	t, err := mi.root.allocateTerminal(ctx, rp.Credentials())
 	if err != nil {
 		return nil, err
 	}
 
-	mi.IncRef()
 	fd := &masterFileDescription{
 		inode: mi,
 		t:     t,
 	}
 	fd.LockFD.Init(&mi.locks)
-	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
-		mi.DecRef(ctx)
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
@@ -86,6 +86,7 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds
 	return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
 }
 
+// +stateify savable
 type masterFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -99,8 +100,7 @@ var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil)
 
 // Release implements vfs.FileDescriptionImpl.Release.
 func (mfd *masterFileDescription) Release(ctx context.Context) {
-	mfd.inode.root.masterClose(mfd.t)
-	mfd.inode.DecRef(ctx)
+	mfd.inode.root.masterClose(ctx, mfd.t)
 }
 
 // EventRegister implements waiter.Waitable.EventRegister.
@@ -130,46 +130,51 @@ func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSeque
 
 // Ioctl implements vfs.FileDescriptionImpl.Ioctl.
 func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// ioctl(2) may only be called from a task goroutine.
+		return 0, syserror.ENOTTY
+	}
+
 	switch cmd := args[1].Uint(); cmd {
 	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
 		// Get the number of bytes in the output queue read buffer.
-		return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args)
+		return 0, mfd.t.ld.outputQueueReadSize(t, io, args)
 	case linux.TCGETS:
 		// N.B. TCGETS on the master actually returns the configuration
-		// of the slave end.
-		return mfd.t.ld.getTermios(ctx, io, args)
+		// of the replica end.
+		return mfd.t.ld.getTermios(t, args)
 	case linux.TCSETS:
 		// N.B. TCSETS on the master actually affects the configuration
-		// of the slave end.
-		return mfd.t.ld.setTermios(ctx, io, args)
+		// of the replica end.
+		return mfd.t.ld.setTermios(t, args)
 	case linux.TCSETSW:
 		// TODO(b/29356795): This should drain the output queue first.
-		return mfd.t.ld.setTermios(ctx, io, args)
+		return mfd.t.ld.setTermios(t, args)
 	case linux.TIOCGPTN:
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		nP := primitive.Uint32(mfd.t.n)
+		_, err := nP.CopyOut(t, args[2].Pointer())
 		return 0, err
 	case linux.TIOCSPTLCK:
 		// TODO(b/29356795): Implement pty locking. For now just pretend we do.
 		return 0, nil
 	case linux.TIOCGWINSZ:
-		return 0, mfd.t.ld.windowSize(ctx, io, args)
+		return 0, mfd.t.ld.windowSize(t, args)
 	case linux.TIOCSWINSZ:
-		return 0, mfd.t.ld.setWindowSize(ctx, io, args)
+		return 0, mfd.t.ld.setWindowSize(t, args)
 	case linux.TIOCSCTTY:
 		// Make the given terminal the controlling terminal of the
 		// calling process.
-		return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+		return 0, mfd.t.setControllingTTY(ctx, args, true /* isMaster */)
 	case linux.TIOCNOTTY:
 		// Release this process's controlling terminal.
-		return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+		return 0, mfd.t.releaseControllingTTY(ctx, args, true /* isMaster */)
 	case linux.TIOCGPGRP:
 		// Get the foreground process group.
-		return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+		return mfd.t.foregroundProcessGroup(ctx, args, true /* isMaster */)
 	case linux.TIOCSPGRP:
 		// Set the foreground process group.
-		return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+		return mfd.t.setForegroundProcessGroup(ctx, args, true /* isMaster */)
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
index dffb4232c..55bff3e60 100644
--- a/pkg/sentry/fsimpl/devpts/queue.go
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -17,8 +17,10 @@ package devpts
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -30,7 +32,7 @@ import (
 const waitBufMaxBytes = 131072
 
 // queue represents one of the input or output queues between a pty master and
-// slave. Bytes written to a queue are added to the read buffer until it is
+// replica. Bytes written to a queue are added to the read buffer until it is
 // full, at which point they are written to the wait buffer. Bytes are
 // processed (i.e. undergo termios transformations) as they are added to the
 // read buffer. The read buffer is readable when its length is nonzero and
@@ -83,17 +85,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
 }
 
 // readableSize writes the number of readable bytes to userspace.
-func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (q *queue) readableSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
 	q.mu.Lock()
 	defer q.mu.Unlock()
-	var size int32
+	size := primitive.Int32(0)
 	if q.readable {
-		size = int32(len(q.readBuf))
+		size = primitive.Int32(len(q.readBuf))
 	}
 
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := size.CopyOut(t, args[2].Pointer())
 	return err
 
 }
@@ -102,8 +102,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca
 // as whether the read caused more readable data to become available (whether
 // data was pushed from the wait buffer to the read buffer).
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
@@ -143,8 +142,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl
 
 // write writes to q from userspace.
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
@@ -186,8 +184,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip
 
 // writeBytes writes to q from b.
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) writeBytes(b []byte, l *lineDiscipline) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go
new file mode 100644
index 000000000..70c68cf0a
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/replica.go
@@ -0,0 +1,201 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// replicaInode is the inode for the replica end of the Terminal.
+//
+// +stateify savable
+type replicaInode struct {
+	implStatFS
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	locks vfs.FileLocks
+
+	// root is the devpts root inode.
+	root *rootInode
+
+	// t is the connected Terminal.
+	t *Terminal
+}
+
+var _ kernfs.Inode = (*replicaInode)(nil)
+
+// Open implements kernfs.Inode.Open.
+func (ri *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &replicaFileDescription{
+		inode: ri,
+	}
+	fd.LockFD.Init(&ri.locks)
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+
+}
+
+// Valid implements kernfs.Inode.Valid.
+func (ri *replicaInode) Valid(context.Context) bool {
+	// Return valid if the replica still exists.
+	ri.root.mu.Lock()
+	defer ri.root.mu.Unlock()
+	_, ok := ri.root.replicas[ri.t.n]
+	return ok
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (ri *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	statx, err := ri.InodeAttrs.Stat(ctx, vfsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	statx.Blksize = 1024
+	statx.RdevMajor = linux.UNIX98_PTY_REPLICA_MAJOR
+	statx.RdevMinor = ri.t.n
+	return statx, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat
+func (ri *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask&linux.STATX_SIZE != 0 {
+		return syserror.EINVAL
+	}
+	return ri.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
+}
+
+// +stateify savable
+type replicaFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+
+	inode *replicaInode
+}
+
+var _ vfs.FileDescriptionImpl = (*replicaFileDescription)(nil)
+
+// Release implements fs.FileOperations.Release.
+func (rfd *replicaFileDescription) Release(ctx context.Context) {}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (rfd *replicaFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	rfd.inode.t.ld.replicaWaiter.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (rfd *replicaFileDescription) EventUnregister(e *waiter.Entry) {
+	rfd.inode.t.ld.replicaWaiter.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (rfd *replicaFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return rfd.inode.t.ld.replicaReadiness()
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (rfd *replicaFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	return rfd.inode.t.ld.inputQueueRead(ctx, dst)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (rfd *replicaFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+	return rfd.inode.t.ld.outputQueueWrite(ctx, src)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// ioctl(2) may only be called from a task goroutine.
+		return 0, syserror.ENOTTY
+	}
+
+	switch cmd := args[1].Uint(); cmd {
+	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
+		// Get the number of bytes in the input queue read buffer.
+		return 0, rfd.inode.t.ld.inputQueueReadSize(t, io, args)
+	case linux.TCGETS:
+		return rfd.inode.t.ld.getTermios(t, args)
+	case linux.TCSETS:
+		return rfd.inode.t.ld.setTermios(t, args)
+	case linux.TCSETSW:
+		// TODO(b/29356795): This should drain the output queue first.
+		return rfd.inode.t.ld.setTermios(t, args)
+	case linux.TIOCGPTN:
+		nP := primitive.Uint32(rfd.inode.t.n)
+		_, err := nP.CopyOut(t, args[2].Pointer())
+		return 0, err
+	case linux.TIOCGWINSZ:
+		return 0, rfd.inode.t.ld.windowSize(t, args)
+	case linux.TIOCSWINSZ:
+		return 0, rfd.inode.t.ld.setWindowSize(t, args)
+	case linux.TIOCSCTTY:
+		// Make the given terminal the controlling terminal of the
+		// calling process.
+		return 0, rfd.inode.t.setControllingTTY(ctx, args, false /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, rfd.inode.t.releaseControllingTTY(ctx, args, false /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return rfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return rfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
+	default:
+		maybeEmitUnimplementedEvent(ctx, cmd)
+		return 0, syserror.ENOTTY
+	}
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (rfd *replicaFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return rfd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (rfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return rfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (rfd *replicaFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return rfd.Locks().LockPOSIX(ctx, &rfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (rfd *replicaFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return rfd.Locks().UnlockPOSIX(ctx, &rfd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
deleted file mode 100644
index 32e4e1908..000000000
--- a/pkg/sentry/fsimpl/devpts/slave.go
+++ /dev/null
@@ -1,197 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package devpts
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
-	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
-	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/pkg/waiter"
-)
-
-// slaveInode is the inode for the slave end of the Terminal.
-type slaveInode struct {
-	kernfs.InodeAttrs
-	kernfs.InodeNoopRefCount
-	kernfs.InodeNotDirectory
-	kernfs.InodeNotSymlink
-
-	locks vfs.FileLocks
-
-	// Keep a reference to this inode's dentry.
-	dentry kernfs.Dentry
-
-	// root is the devpts root inode.
-	root *rootInode
-
-	// t is the connected Terminal.
-	t *Terminal
-}
-
-var _ kernfs.Inode = (*slaveInode)(nil)
-
-// Open implements kernfs.Inode.Open.
-func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	si.IncRef()
-	fd := &slaveFileDescription{
-		inode: si,
-	}
-	fd.LockFD.Init(&si.locks)
-	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
-		si.DecRef(ctx)
-		return nil, err
-	}
-	return &fd.vfsfd, nil
-
-}
-
-// Valid implements kernfs.Inode.Valid.
-func (si *slaveInode) Valid(context.Context) bool {
-	// Return valid if the slave still exists.
-	si.root.mu.Lock()
-	defer si.root.mu.Unlock()
-	_, ok := si.root.slaves[si.t.n]
-	return ok
-}
-
-// Stat implements kernfs.Inode.Stat.
-func (si *slaveInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts)
-	if err != nil {
-		return linux.Statx{}, err
-	}
-	statx.Blksize = 1024
-	statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR
-	statx.RdevMinor = si.t.n
-	return statx, nil
-}
-
-// SetStat implements kernfs.Inode.SetStat
-func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
-	if opts.Stat.Mask&linux.STATX_SIZE != 0 {
-		return syserror.EINVAL
-	}
-	return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
-}
-
-type slaveFileDescription struct {
-	vfsfd vfs.FileDescription
-	vfs.FileDescriptionDefaultImpl
-	vfs.LockFD
-
-	inode *slaveInode
-}
-
-var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil)
-
-// Release implements fs.FileOperations.Release.
-func (sfd *slaveFileDescription) Release(ctx context.Context) {
-	sfd.inode.DecRef(ctx)
-}
-
-// EventRegister implements waiter.Waitable.EventRegister.
-func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
-	sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask)
-}
-
-// EventUnregister implements waiter.Waitable.EventUnregister.
-func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) {
-	sfd.inode.t.ld.slaveWaiter.EventUnregister(e)
-}
-
-// Readiness implements waiter.Waitable.Readiness.
-func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
-	return sfd.inode.t.ld.slaveReadiness()
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
-	return sfd.inode.t.ld.inputQueueRead(ctx, dst)
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
-	return sfd.inode.t.ld.outputQueueWrite(ctx, src)
-}
-
-// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
-func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
-	switch cmd := args[1].Uint(); cmd {
-	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
-		// Get the number of bytes in the input queue read buffer.
-		return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args)
-	case linux.TCGETS:
-		return sfd.inode.t.ld.getTermios(ctx, io, args)
-	case linux.TCSETS:
-		return sfd.inode.t.ld.setTermios(ctx, io, args)
-	case linux.TCSETSW:
-		// TODO(b/29356795): This should drain the output queue first.
-		return sfd.inode.t.ld.setTermios(ctx, io, args)
-	case linux.TIOCGPTN:
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
-		return 0, err
-	case linux.TIOCGWINSZ:
-		return 0, sfd.inode.t.ld.windowSize(ctx, io, args)
-	case linux.TIOCSWINSZ:
-		return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args)
-	case linux.TIOCSCTTY:
-		// Make the given terminal the controlling terminal of the
-		// calling process.
-		return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */)
-	case linux.TIOCNOTTY:
-		// Release this process's controlling terminal.
-		return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
-	case linux.TIOCGPGRP:
-		// Get the foreground process group.
-		return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
-	case linux.TIOCSPGRP:
-		// Set the foreground process group.
-		return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
-	default:
-		maybeEmitUnimplementedEvent(ctx, cmd)
-		return 0, syserror.ENOTTY
-	}
-}
-
-// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	creds := auth.CredentialsFromContext(ctx)
-	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return sfd.inode.SetStat(ctx, fs, creds, opts)
-}
-
-// Stat implements vfs.FileDescriptionImpl.Stat.
-func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
-	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return sfd.inode.Stat(ctx, fs, opts)
-}
-
-// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
-func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
-	return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block)
-}
-
-// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
-func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
-	return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence)
-}
diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go
index 7d2781c54..510bd6d89 100644
--- a/pkg/sentry/fsimpl/devpts/terminal.go
+++ b/pkg/sentry/fsimpl/devpts/terminal.go
@@ -17,9 +17,9 @@ package devpts
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Terminal is a pseudoterminal.
@@ -36,25 +36,25 @@ type Terminal struct {
 	// this terminal. This field is immutable.
 	masterKTTY *kernel.TTY
 
-	// slaveKTTY contains the controlling process of the slave end of this
+	// replicaKTTY contains the controlling process of the replica end of this
 	// terminal. This field is immutable.
-	slaveKTTY *kernel.TTY
+	replicaKTTY *kernel.TTY
 }
 
 func newTerminal(n uint32) *Terminal {
-	termios := linux.DefaultSlaveTermios
+	termios := linux.DefaultReplicaTermios
 	t := Terminal{
-		n:          n,
-		ld:         newLineDiscipline(termios),
-		masterKTTY: &kernel.TTY{Index: n},
-		slaveKTTY:  &kernel.TTY{Index: n},
+		n:           n,
+		ld:          newLineDiscipline(termios),
+		masterKTTY:  &kernel.TTY{Index: n},
+		replicaKTTY: &kernel.TTY{Index: n},
 	}
 	return &t
 }
 
 // setControllingTTY makes tm the controlling terminal of the calling thread
 // group.
-func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("setControllingTTY must be called from a task context")
@@ -65,7 +65,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a
 
 // releaseControllingTTY removes tm as the controlling terminal of the calling
 // thread group.
-func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("releaseControllingTTY must be called from a task context")
@@ -75,7 +75,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar
 }
 
 // foregroundProcessGroup gets the process group ID of tm's foreground process.
-func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("foregroundProcessGroup must be called from a task context")
@@ -87,24 +87,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a
 	}
 
 	// Write it out to *arg.
-	_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	retP := primitive.Int32(ret)
+	_, err = retP.CopyOut(task, args[2].Pointer())
 	return 0, err
 }
 
 // foregroundProcessGroup sets tm's foreground process.
-func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("setForegroundProcessGroup must be called from a task context")
 	}
 
 	// Read in the process group ID.
-	var pgid int32
-	if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
-		AddressSpaceActive: true,
-	}); err != nil {
+	var pgid primitive.Int32
+	if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil {
 		return 0, err
 	}
 
@@ -116,5 +113,5 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
 	if isMaster {
 		return tm.masterKTTY
 	}
-	return tm.slaveKTTY
+	return tm.replicaKTTY
 }
diff --git a/pkg/sentry/fsimpl/devtmpfs/BUILD b/pkg/sentry/fsimpl/devtmpfs/BUILD
index aa0c2ad8c..e49a04c1b 100644
--- a/pkg/sentry/fsimpl/devtmpfs/BUILD
+++ b/pkg/sentry/fsimpl/devtmpfs/BUILD
@@ -4,7 +4,10 @@ licenses(["notice"])
 
 go_library(
     name = "devtmpfs",
-    srcs = ["devtmpfs.go"],
+    srcs = [
+        "devtmpfs.go",
+        "save_restore.go",
+    ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -24,6 +27,7 @@ go_test(
     library = ":devtmpfs",
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fspath",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/tmpfs",
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index 2ed5fa8a9..e6fe0fc0d 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -18,6 +18,7 @@ package devtmpfs
 
 import (
 	"fmt"
+	"path"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -32,8 +33,10 @@ import (
 const Name = "devtmpfs"
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct {
-	initOnce sync.Once
+	initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1664): not yet supported.
 	initErr  error
 
 	// fs is the tmpfs filesystem that backs all mounts of this FilesystemType.
@@ -68,6 +71,15 @@ func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virtua
 	return fst.fs, fst.root, nil
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (fst *FilesystemType) Release(ctx context.Context) {
+	if fst.fs != nil {
+		// Release the original reference obtained when creating the filesystem.
+		fst.root.DecRef(ctx)
+		fst.fs.DecRef(ctx)
+	}
+}
+
 // Accessor allows devices to create device special files in devtmpfs.
 type Accessor struct {
 	vfsObj *vfs.VirtualFilesystem
@@ -79,14 +91,17 @@ type Accessor struct {
 // NewAccessor returns an Accessor that supports creation of device special
 // files in the devtmpfs instance registered with name fsTypeName in vfsObj.
 func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) {
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.MountOptions{})
 	if err != nil {
 		return nil, err
 	}
+	// Pass a reference on root to the Accessor.
+	root := mntns.Root()
+	root.IncRef()
 	return &Accessor{
 		vfsObj: vfsObj,
 		mntns:  mntns,
-		root:   mntns.Root(),
+		root:   root,
 		creds:  creds,
 	}, nil
 }
@@ -150,13 +165,11 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v
 
 	// Create any parent directories. See
 	// devtmpfs.c:handle_create()=>path_create().
-	for it := fspath.Parse(pathname).Begin; it.NextOk(); it = it.Next() {
-		pop := a.pathOperationAt(it.String())
-		if err := a.vfsObj.MkdirAt(actx, a.creds, pop, &vfs.MkdirOptions{
-			Mode: 0755,
-		}); err != nil {
-			return fmt.Errorf("failed to create directory %q: %v", it.String(), err)
-		}
+	parent := path.Dir(pathname)
+	if err := a.vfsObj.MkdirAllAt(ctx, parent, a.root, a.creds, &vfs.MkdirOptions{
+		Mode: 0755,
+	}); err != nil {
+		return fmt.Errorf("failed to create device parent directory %q: %v", parent, err)
 	}
 
 	// NOTE: Linux's devtmpfs refuses to automatically delete files it didn't
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 747867cca..e058eda7a 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -15,9 +15,11 @@
 package devtmpfs
 
 import (
+	"path"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
@@ -25,10 +27,13 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
-func TestDevtmpfs(t *testing.T) {
+const devPath = "/dev"
+
+func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry, func()) {
+	t.Helper()
+
 	ctx := contexttest.Context(t)
 	creds := auth.CredentialsFromContext(ctx)
-
 	vfsObj := &vfs.VirtualFilesystem{}
 	if err := vfsObj.Init(ctx); err != nil {
 		t.Fatalf("VFS init: %v", err)
@@ -43,14 +48,12 @@ func TestDevtmpfs(t *testing.T) {
 	})
 
 	// Create a test mount namespace with devtmpfs mounted at "/dev".
-	const devPath = "/dev"
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
 	}
-	defer mntns.DecRef(ctx)
 	root := mntns.Root()
-	defer root.DecRef(ctx)
+	root.IncRef()
 	devpop := vfs.PathOperation{
 		Root:  root,
 		Start: root,
@@ -61,10 +64,20 @@ func TestDevtmpfs(t *testing.T) {
 	}); err != nil {
 		t.Fatalf("failed to create mount point: %v", err)
 	}
-	if err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil {
+	if _, err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil {
 		t.Fatalf("failed to mount devtmpfs: %v", err)
 	}
 
+	return ctx, creds, vfsObj, root, func() {
+		root.DecRef(ctx)
+		mntns.DecRef(ctx)
+	}
+}
+
+func TestUserspaceInit(t *testing.T) {
+	ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t)
+	defer cleanup()
+
 	a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs")
 	if err != nil {
 		t.Fatalf("failed to create devtmpfs.Accessor: %v", err)
@@ -75,48 +88,143 @@ func TestDevtmpfs(t *testing.T) {
 	if err := a.UserspaceInit(ctx); err != nil {
 		t.Fatalf("failed to userspace-initialize devtmpfs: %v", err)
 	}
+
 	// Created files should be visible in the test mount namespace.
-	abspath := devPath + "/fd"
-	target, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(abspath),
-	})
-	if want := "/proc/self/fd"; err != nil || target != want {
-		t.Fatalf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, target, err, want)
+	links := []struct {
+		source string
+		target string
+	}{
+		{
+			source: "fd",
+			target: "/proc/self/fd",
+		},
+		{
+			source: "stdin",
+			target: "/proc/self/fd/0",
+		},
+		{
+			source: "stdout",
+			target: "/proc/self/fd/1",
+		},
+		{
+			source: "stderr",
+			target: "/proc/self/fd/2",
+		},
+		{
+			source: "ptmx",
+			target: "pts/ptmx",
+		},
 	}
 
-	// Create a dummy device special file using a devtmpfs.Accessor.
-	const (
-		pathInDev = "dummy"
-		kind      = vfs.CharDevice
-		major     = 12
-		minor     = 34
-		perms     = 0600
-		wantMode  = linux.S_IFCHR | perms
-	)
-	if err := a.CreateDeviceFile(ctx, pathInDev, kind, major, minor, perms); err != nil {
-		t.Fatalf("failed to create device file: %v", err)
+	for _, link := range links {
+		abspath := path.Join(devPath, link.source)
+		if gotTarget, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{
+			Root:  root,
+			Start: root,
+			Path:  fspath.Parse(abspath),
+		}); err != nil || gotTarget != link.target {
+			t.Errorf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, gotTarget, err, link.target)
+		}
 	}
-	// The device special file should be visible in the test mount namespace.
-	abspath = devPath + "/" + pathInDev
-	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(abspath),
-	}, &vfs.StatOptions{
-		Mask: linux.STATX_TYPE | linux.STATX_MODE,
-	})
-	if err != nil {
-		t.Fatalf("failed to stat device file at %q: %v", abspath, err)
+
+	dirs := []string{"shm", "pts"}
+	for _, dir := range dirs {
+		abspath := path.Join(devPath, dir)
+		statx, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+			Root:  root,
+			Start: root,
+			Path:  fspath.Parse(abspath),
+		}, &vfs.StatOptions{
+			Mask: linux.STATX_MODE,
+		})
+		if err != nil {
+			t.Errorf("stat(%q): got error %v ", abspath, err)
+			continue
+		}
+		if want := uint16(0755) | linux.S_IFDIR; statx.Mode != want {
+			t.Errorf("stat(%q): got mode %x, want %x", abspath, statx.Mode, want)
+		}
 	}
-	if stat.Mode != wantMode {
-		t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode)
+}
+
+func TestCreateDeviceFile(t *testing.T) {
+	ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t)
+	defer cleanup()
+
+	a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs")
+	if err != nil {
+		t.Fatalf("failed to create devtmpfs.Accessor: %v", err)
 	}
-	if stat.RdevMajor != major {
-		t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, major)
+	defer a.Release(ctx)
+
+	devFiles := []struct {
+		path  string
+		kind  vfs.DeviceKind
+		major uint32
+		minor uint32
+		perms uint16
+	}{
+		{
+			path:  "dummy",
+			kind:  vfs.CharDevice,
+			major: 12,
+			minor: 34,
+			perms: 0600,
+		},
+		{
+			path:  "foo/bar",
+			kind:  vfs.BlockDevice,
+			major: 13,
+			minor: 35,
+			perms: 0660,
+		},
+		{
+			path:  "foo/baz",
+			kind:  vfs.CharDevice,
+			major: 12,
+			minor: 40,
+			perms: 0666,
+		},
+		{
+			path:  "a/b/c/d/e",
+			kind:  vfs.BlockDevice,
+			major: 12,
+			minor: 34,
+			perms: 0600,
+		},
 	}
-	if stat.RdevMinor != minor {
-		t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, minor)
+
+	for _, f := range devFiles {
+		if err := a.CreateDeviceFile(ctx, f.path, f.kind, f.major, f.minor, f.perms); err != nil {
+			t.Fatalf("failed to create device file: %v", err)
+		}
+		// The device special file should be visible in the test mount namespace.
+		abspath := path.Join(devPath, f.path)
+		stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+			Root:  root,
+			Start: root,
+			Path:  fspath.Parse(abspath),
+		}, &vfs.StatOptions{
+			Mask: linux.STATX_TYPE | linux.STATX_MODE,
+		})
+		if err != nil {
+			t.Fatalf("failed to stat device file at %q: %v", abspath, err)
+		}
+		if stat.RdevMajor != f.major {
+			t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, f.major)
+		}
+		if stat.RdevMinor != f.minor {
+			t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, f.minor)
+		}
+		wantMode := f.perms
+		switch f.kind {
+		case vfs.CharDevice:
+			wantMode |= linux.S_IFCHR
+		case vfs.BlockDevice:
+			wantMode |= linux.S_IFBLK
+		}
+		if stat.Mode != wantMode {
+			t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode)
+		}
 	}
 }
diff --git a/pkg/sentry/fsimpl/devtmpfs/save_restore.go b/pkg/sentry/fsimpl/devtmpfs/save_restore.go
new file mode 100644
index 000000000..28832d850
--- /dev/null
+++ b/pkg/sentry/fsimpl/devtmpfs/save_restore.go
@@ -0,0 +1,23 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devtmpfs
+
+// afterLoad is invoked by stateify.
+func (fst *FilesystemType) afterLoad() {
+	if fst.fs != nil {
+		// Ensure that we don't create another filesystem.
+		fst.initOnce.Do(func() {})
+	}
+}
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
index 812171fa3..5b29f2358 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -30,9 +30,11 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// EventFileDescription implements FileDescriptionImpl for file-based event
+// EventFileDescription implements vfs.FileDescriptionImpl for file-based event
 // notification (eventfd). Eventfds are usually internal to the Sentry but in
 // certain situations they may be converted into a host-backed eventfd.
+//
+// +stateify savable
 type EventFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -41,7 +43,7 @@ type EventFileDescription struct {
 
 	// queue is used to notify interested parties when the event object
 	// becomes readable or writable.
-	queue waiter.Queue `state:"zerovalue"`
+	queue waiter.Queue
 
 	// mu protects the fields below.
 	mu sync.Mutex `state:"nosave"`
@@ -106,7 +108,7 @@ func (efd *EventFileDescription) HostFD() (int, error) {
 	return efd.hostfd, nil
 }
 
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
 func (efd *EventFileDescription) Release(context.Context) {
 	efd.mu.Lock()
 	defer efd.mu.Unlock()
@@ -119,7 +121,7 @@ func (efd *EventFileDescription) Release(context.Context) {
 	}
 }
 
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
 func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
 	if dst.NumBytes() < 8 {
 		return 0, syscall.EINVAL
@@ -130,7 +132,7 @@ func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequenc
 	return 8, nil
 }
 
-// Write implements FileDescriptionImpl.Write.
+// Write implements vfs.FileDescriptionImpl.Write.
 func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
 	if src.NumBytes() < 8 {
 		return 0, syscall.EINVAL
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index abc610ef3..7b1eec3da 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -51,6 +51,8 @@ go_library(
         "//pkg/fd",
         "//pkg/fspath",
         "//pkg/log",
+        "//pkg/marshal",
+        "//pkg/marshal/primitive",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs",
@@ -86,9 +88,9 @@ go_test(
     library = ":ext",
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/marshal/primitive",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/ext/disklayout",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 8f7d5a9bb..2ee7cc7ac 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -59,13 +59,18 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
 	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: int(f.Fd()),
+		},
+	})
 	if err != nil {
 		f.Close()
 		return nil, nil, nil, nil, err
 	}
 
 	root := mntns.Root()
+	root.IncRef()
 
 	tearDown := func() {
 		root.DecRef(ctx)
@@ -90,7 +95,7 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf
 	ctx := contexttest.Context(b)
 	creds := auth.CredentialsFromContext(ctx)
 
-	if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
+	if _, err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
 		GetFilesystemOptions: vfs.GetFilesystemOptions{
 			InternalData: int(f.Fd()),
 		},
diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go
index 8bb104ff0..1165234f9 100644
--- a/pkg/sentry/fsimpl/ext/block_map_file.go
+++ b/pkg/sentry/fsimpl/ext/block_map_file.go
@@ -18,7 +18,7 @@ import (
 	"io"
 	"math"
 
-	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -34,19 +34,19 @@ type blockMapFile struct {
 
 	// directBlks are the direct blocks numbers. The physical blocks pointed by
 	// these holds file data. Contains file blocks 0 to 11.
-	directBlks [numDirectBlks]uint32
+	directBlks [numDirectBlks]primitive.Uint32
 
 	// indirectBlk is the physical block which contains (blkSize/4) direct block
 	// numbers (as uint32 integers).
-	indirectBlk uint32
+	indirectBlk primitive.Uint32
 
 	// doubleIndirectBlk is the physical block which contains (blkSize/4) indirect
 	// block numbers (as uint32 integers).
-	doubleIndirectBlk uint32
+	doubleIndirectBlk primitive.Uint32
 
 	// tripleIndirectBlk is the physical block which contains (blkSize/4) doubly
 	// indirect block numbers (as uint32 integers).
-	tripleIndirectBlk uint32
+	tripleIndirectBlk primitive.Uint32
 
 	// coverage at (i)th index indicates the amount of file data a node at
 	// height (i) covers. Height 0 is the direct block.
@@ -68,10 +68,12 @@ func newBlockMapFile(args inodeArgs) (*blockMapFile, error) {
 	}
 
 	blkMap := file.regFile.inode.diskInode.Data()
-	binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks)
-	binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk)
-	binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk)
-	binary.Unmarshal(blkMap[(numDirectBlks+2)*4:(numDirectBlks+3)*4], binary.LittleEndian, &file.tripleIndirectBlk)
+	for i := 0; i < numDirectBlks; i++ {
+		file.directBlks[i].UnmarshalBytes(blkMap[i*4 : (i+1)*4])
+	}
+	file.indirectBlk.UnmarshalBytes(blkMap[numDirectBlks*4 : (numDirectBlks+1)*4])
+	file.doubleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+1)*4 : (numDirectBlks+2)*4])
+	file.tripleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+2)*4 : (numDirectBlks+3)*4])
 	return file, nil
 }
 
@@ -117,16 +119,16 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
 		switch {
 		case offset < dirBlksEnd:
 			// Direct block.
-			curR, err = f.read(f.directBlks[offset/f.regFile.inode.blkSize], offset%f.regFile.inode.blkSize, 0, dst[read:])
+			curR, err = f.read(uint32(f.directBlks[offset/f.regFile.inode.blkSize]), offset%f.regFile.inode.blkSize, 0, dst[read:])
 		case offset < indirBlkEnd:
 			// Indirect block.
-			curR, err = f.read(f.indirectBlk, offset-dirBlksEnd, 1, dst[read:])
+			curR, err = f.read(uint32(f.indirectBlk), offset-dirBlksEnd, 1, dst[read:])
 		case offset < doubIndirBlkEnd:
 			// Doubly indirect block.
-			curR, err = f.read(f.doubleIndirectBlk, offset-indirBlkEnd, 2, dst[read:])
+			curR, err = f.read(uint32(f.doubleIndirectBlk), offset-indirBlkEnd, 2, dst[read:])
 		default:
 			// Triply indirect block.
-			curR, err = f.read(f.tripleIndirectBlk, offset-doubIndirBlkEnd, 3, dst[read:])
+			curR, err = f.read(uint32(f.tripleIndirectBlk), offset-doubIndirBlkEnd, 3, dst[read:])
 		}
 
 		read += curR
@@ -174,13 +176,13 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds
 	read := 0
 	curChildOff := relFileOff % childCov
 	for i := startIdx; i < endIdx; i++ {
-		var childPhyBlk uint32
+		var childPhyBlk primitive.Uint32
 		err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk)
 		if err != nil {
 			return read, err
 		}
 
-		n, err := f.read(childPhyBlk, curChildOff, height-1, dst[read:])
+		n, err := f.read(uint32(childPhyBlk), curChildOff, height-1, dst[read:])
 		read += n
 		if err != nil {
 			return read, err
diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go
index 6fa84e7aa..ed98b482e 100644
--- a/pkg/sentry/fsimpl/ext/block_map_test.go
+++ b/pkg/sentry/fsimpl/ext/block_map_test.go
@@ -20,7 +20,7 @@ import (
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
-	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 )
 
@@ -87,29 +87,33 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
 	mockDisk := make([]byte, mockBMDiskSize)
 	var fileData []byte
 	blkNums := newBlkNumGen()
-	var data []byte
+	off := 0
+	data := make([]byte, (numDirectBlks+3)*(*primitive.Uint32)(nil).SizeBytes())
 
 	// Write the direct blocks.
 	for i := 0; i < numDirectBlks; i++ {
-		curBlkNum := blkNums.next()
-		data = binary.Marshal(data, binary.LittleEndian, curBlkNum)
-		fileData = append(fileData, writeFileDataToBlock(mockDisk, curBlkNum, 0, blkNums)...)
+		curBlkNum := primitive.Uint32(blkNums.next())
+		curBlkNum.MarshalBytes(data[off:])
+		off += curBlkNum.SizeBytes()
+		fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(curBlkNum), 0, blkNums)...)
 	}
 
 	// Write to indirect block.
-	indirectBlk := blkNums.next()
-	data = binary.Marshal(data, binary.LittleEndian, indirectBlk)
-	fileData = append(fileData, writeFileDataToBlock(mockDisk, indirectBlk, 1, blkNums)...)
-
-	// Write to indirect block.
-	doublyIndirectBlk := blkNums.next()
-	data = binary.Marshal(data, binary.LittleEndian, doublyIndirectBlk)
-	fileData = append(fileData, writeFileDataToBlock(mockDisk, doublyIndirectBlk, 2, blkNums)...)
-
-	// Write to indirect block.
-	triplyIndirectBlk := blkNums.next()
-	data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk)
-	fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...)
+	indirectBlk := primitive.Uint32(blkNums.next())
+	indirectBlk.MarshalBytes(data[off:])
+	off += indirectBlk.SizeBytes()
+	fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(indirectBlk), 1, blkNums)...)
+
+	// Write to double indirect block.
+	doublyIndirectBlk := primitive.Uint32(blkNums.next())
+	doublyIndirectBlk.MarshalBytes(data[off:])
+	off += doublyIndirectBlk.SizeBytes()
+	fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(doublyIndirectBlk), 2, blkNums)...)
+
+	// Write to triple indirect block.
+	triplyIndirectBlk := primitive.Uint32(blkNums.next())
+	triplyIndirectBlk.MarshalBytes(data[off:])
+	fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(triplyIndirectBlk), 3, blkNums)...)
 
 	args := inodeArgs{
 		fs: &filesystem{
@@ -142,9 +146,9 @@ func writeFileDataToBlock(disk []byte, blkNum uint32, height uint, blkNums *blkN
 
 	var fileData []byte
 	for off := blkNum * mockBMBlkSize; off < (blkNum+1)*mockBMBlkSize; off += 4 {
-		curBlkNum := blkNums.next()
-		copy(disk[off:off+4], binary.Marshal(nil, binary.LittleEndian, curBlkNum))
-		fileData = append(fileData, writeFileDataToBlock(disk, curBlkNum, height-1, blkNums)...)
+		curBlkNum := primitive.Uint32(blkNums.next())
+		curBlkNum.MarshalBytes(disk[off : off+4])
+		fileData = append(fileData, writeFileDataToBlock(disk, uint32(curBlkNum), height-1, blkNums)...)
 	}
 	return fileData
 }
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index 7a1b4219f..9bfed883a 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -20,6 +20,8 @@ import (
 )
 
 // dentry implements vfs.DentryImpl.
+//
+// +stateify savable
 type dentry struct {
 	vfsd vfs.Dentry
 
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index 0fc01668d..0ad79b381 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -16,7 +16,6 @@ package ext
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -28,6 +27,8 @@ import (
 )
 
 // directory represents a directory inode. It holds the childList in memory.
+//
+// +stateify savable
 type directory struct {
 	inode inode
 
@@ -39,7 +40,7 @@ type directory struct {
 	// Lock Order (outermost locks must be taken first):
 	//   directory.mu
 	//     filesystem.mu
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
 	// childList is a list containing (1) child dirents and (2) fake dirents
 	// (with diskDirent == nil) that represent the iteration position of
@@ -98,7 +99,7 @@ func newDirectory(args inodeArgs, newDirent bool) (*directory, error) {
 		} else {
 			curDirent.diskDirent = &disklayout.DirentOld{}
 		}
-		binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent)
+		curDirent.diskDirent.UnmarshalBytes(buf)
 
 		if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 {
 			// Inode number and name length fields being set to 0 is used to indicate
@@ -120,6 +121,8 @@ func (i *inode) isDir() bool {
 }
 
 // dirent is the directory.childList node.
+//
+// +stateify savable
 type dirent struct {
 	diskDirent disklayout.Dirent
 
@@ -129,6 +132,8 @@ type dirent struct {
 
 // directoryFD represents a directory file description. It implements
 // vfs.FileDescriptionImpl.
+//
+// +stateify savable
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD
index 9bd9c76c0..d98a05dd8 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/BUILD
+++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD
@@ -22,10 +22,11 @@ go_library(
         "superblock_old.go",
         "test_utils.go",
     ],
+    marshal = True,
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
+        "//pkg/marshal",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group.go b/pkg/sentry/fsimpl/ext/disklayout/block_group.go
index ad6f4fef8..0d56ae9da 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group.go
@@ -14,6 +14,10 @@
 
 package disklayout
 
+import (
+	"gvisor.dev/gvisor/pkg/marshal"
+)
+
 // BlockGroup represents a Linux ext block group descriptor. An ext file system
 // is split into a series of block groups. This provides an access layer to
 // information needed to access and use a block group.
@@ -30,6 +34,8 @@ package disklayout
 //
 // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors.
 type BlockGroup interface {
+	marshal.Marshallable
+
 	// InodeTable returns the absolute block number of the block containing the
 	// inode table. This points to an array of Inode structs. Inode tables are
 	// statically allocated at mkfs time. The superblock records the number of
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
index 3e16c76db..a35fa22a0 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
@@ -17,6 +17,8 @@ package disklayout
 // BlockGroup32Bit emulates the first half of struct ext4_group_desc in
 // fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and
 // 32-bit ext4 filesystems. It implements BlockGroup interface.
+//
+// +marshal
 type BlockGroup32Bit struct {
 	BlockBitmapLo         uint32
 	InodeBitmapLo         uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
index 9a809197a..d54d1d345 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
@@ -18,6 +18,8 @@ package disklayout
 // It is the block group descriptor struct for 64-bit ext4 filesystems.
 // It implements BlockGroup interface. It is an extension of the 32-bit
 // version of BlockGroup.
+//
+// +marshal
 type BlockGroup64Bit struct {
 	// We embed the 32-bit struct here because 64-bit version is just an extension
 	// of the 32-bit version.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
index 0ef4294c0..e4ce484e4 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
@@ -21,6 +21,8 @@ import (
 // TestBlockGroupSize tests that the block group descriptor structs are of the
 // correct size.
 func TestBlockGroupSize(t *testing.T) {
-	assertSize(t, BlockGroup32Bit{}, 32)
-	assertSize(t, BlockGroup64Bit{}, 64)
+	var bgSmall BlockGroup32Bit
+	assertSize(t, &bgSmall, 32)
+	var bgBig BlockGroup64Bit
+	assertSize(t, &bgBig, 64)
 }
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent.go b/pkg/sentry/fsimpl/ext/disklayout/dirent.go
index 417b6cf65..568c8cb4c 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent.go
@@ -15,6 +15,7 @@
 package disklayout
 
 import (
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
@@ -51,6 +52,8 @@ var (
 //
 // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories.
 type Dirent interface {
+	marshal.Marshallable
+
 	// Inode returns the absolute inode number of the underlying inode.
 	// Inode number 0 signifies an unused dirent.
 	Inode() uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
index 29ae4a5c2..51f9c2946 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
@@ -29,12 +29,14 @@ import (
 // Note: This struct can be of variable size on disk. The one described below
 // is of maximum size and the FileName beyond NameLength bytes might contain
 // garbage.
+//
+// +marshal
 type DirentNew struct {
 	InodeNumber  uint32
 	RecordLength uint16
 	NameLength   uint8
 	FileTypeRaw  uint8
-	FileNameRaw  [MaxFileName]byte
+	FileNameRaw  [MaxFileName]byte `marshal:"unaligned"`
 }
 
 // Compiles only if DirentNew implements Dirent.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
index 6fff12a6e..d4b19e086 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
@@ -22,11 +22,13 @@ import "gvisor.dev/gvisor/pkg/sentry/fs"
 // Note: This struct can be of variable size on disk. The one described below
 // is of maximum size and the FileName beyond NameLength bytes might contain
 // garbage.
+//
+// +marshal
 type DirentOld struct {
 	InodeNumber  uint32
 	RecordLength uint16
 	NameLength   uint16
-	FileNameRaw  [MaxFileName]byte
+	FileNameRaw  [MaxFileName]byte `marshal:"unaligned"`
 }
 
 // Compiles only if DirentOld implements Dirent.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
index 934919f8a..3486864dc 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
@@ -21,6 +21,8 @@ import (
 // TestDirentSize tests that the dirent structs are of the correct
 // size.
 func TestDirentSize(t *testing.T) {
-	assertSize(t, DirentOld{}, uintptr(DirentSize))
-	assertSize(t, DirentNew{}, uintptr(DirentSize))
+	var dOld DirentOld
+	assertSize(t, &dOld, DirentSize)
+	var dNew DirentNew
+	assertSize(t, &dNew, DirentSize)
 }
diff --git a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
index bdf4e2132..0834e9ba8 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
@@ -36,8 +36,6 @@
 //     escape analysis on an unknown implementation at compile time.
 //
 // Notes:
-//   - All fields in these structs are exported because binary.Read would
-//     panic otherwise.
 //   - All structures on disk are in little-endian order. Only jbd2 (journal)
 //     structures are in big-endian order.
 //   - All OS dependent fields in these structures will be interpretted using
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go
index 4110649ab..b13999bfc 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go
@@ -14,6 +14,10 @@
 
 package disklayout
 
+import (
+	"gvisor.dev/gvisor/pkg/marshal"
+)
+
 // Extents were introduced in ext4 and provide huge performance gains in terms
 // data locality and reduced metadata block usage. Extents are organized in
 // extent trees. The root node is contained in inode.BlocksRaw.
@@ -64,6 +68,8 @@ type ExtentNode struct {
 // ExtentEntry represents an extent tree node entry. The entry can either be
 // an ExtentIdx or Extent itself. This exists to simplify navigation logic.
 type ExtentEntry interface {
+	marshal.Marshallable
+
 	// FileBlock returns the first file block number covered by this entry.
 	FileBlock() uint32
 
@@ -75,6 +81,8 @@ type ExtentEntry interface {
 // tree node begins with this and is followed by `NumEntries` number of:
 //   - Extent         if `Depth` == 0
 //   - ExtentIdx      otherwise
+//
+// +marshal
 type ExtentHeader struct {
 	// Magic in the extent magic number, must be 0xf30a.
 	Magic uint16
@@ -96,6 +104,8 @@ type ExtentHeader struct {
 // internal nodes. Sorted in ascending order based on FirstFileBlock since
 // Linux does a binary search on this. This points to a block containing the
 // child node.
+//
+// +marshal
 type ExtentIdx struct {
 	FirstFileBlock uint32
 	ChildBlockLo   uint32
@@ -121,6 +131,8 @@ func (ei *ExtentIdx) PhysicalBlock() uint64 {
 // nodes. Sorted in ascending order based on FirstFileBlock since Linux does a
 // binary search on this. This points to an array of data blocks containing the
 // file data. It covers `Length` data blocks starting from `StartBlock`.
+//
+// +marshal
 type Extent struct {
 	FirstFileBlock uint32
 	Length         uint16
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
index 8762b90db..c96002e19 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
@@ -21,7 +21,10 @@ import (
 // TestExtentSize tests that the extent structs are of the correct
 // size.
 func TestExtentSize(t *testing.T) {
-	assertSize(t, ExtentHeader{}, ExtentHeaderSize)
-	assertSize(t, ExtentIdx{}, ExtentEntrySize)
-	assertSize(t, Extent{}, ExtentEntrySize)
+	var h ExtentHeader
+	assertSize(t, &h, ExtentHeaderSize)
+	var i ExtentIdx
+	assertSize(t, &i, ExtentEntrySize)
+	var e Extent
+	assertSize(t, &e, ExtentEntrySize)
 }
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode.go b/pkg/sentry/fsimpl/ext/disklayout/inode.go
index 88ae913f5..ef25040a9 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode.go
@@ -16,6 +16,7 @@ package disklayout
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 )
@@ -38,6 +39,8 @@ const (
 //
 // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes.
 type Inode interface {
+	marshal.Marshallable
+
 	// Mode returns the linux file mode which is majorly used to extract
 	// information like:
 	// - File permissions (read/write/execute by user/group/others).
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
index 8f9f574ce..a4503f5cf 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
@@ -27,6 +27,8 @@ import "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 // are used to provide nanoscond precision. Hence, these timestamps will now
 // overflow in May 2446.
 // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps.
+//
+// +marshal
 type InodeNew struct {
 	InodeOld
 
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
index db25b11b6..e6b28babf 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
@@ -30,6 +30,8 @@ const (
 //
 // All fields representing time are in seconds since the epoch. Which means that
 // they will overflow in January 2038.
+//
+// +marshal
 type InodeOld struct {
 	ModeRaw uint16
 	UIDLo   uint16
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
index dd03ee50e..90744e956 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
@@ -24,10 +24,12 @@ import (
 
 // TestInodeSize tests that the inode structs are of the correct size.
 func TestInodeSize(t *testing.T) {
-	assertSize(t, InodeOld{}, OldInodeSize)
+	var iOld InodeOld
+	assertSize(t, &iOld, OldInodeSize)
 
 	// This was updated from 156 bytes to 160 bytes in Oct 2015.
-	assertSize(t, InodeNew{}, 160)
+	var iNew InodeNew
+	assertSize(t, &iNew, 160)
 }
 
 // TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock.go b/pkg/sentry/fsimpl/ext/disklayout/superblock.go
index 8bb327006..70948ebe9 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock.go
@@ -14,6 +14,10 @@
 
 package disklayout
 
+import (
+	"gvisor.dev/gvisor/pkg/marshal"
+)
+
 const (
 	// SbOffset is the absolute offset at which the superblock is placed.
 	SbOffset = 1024
@@ -38,6 +42,8 @@ const (
 //
 // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block.
 type SuperBlock interface {
+	marshal.Marshallable
+
 	// InodesCount returns the total number of inodes in this filesystem.
 	InodesCount() uint32
 
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
index 53e515fd3..4dc6080fb 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
@@ -17,6 +17,8 @@ package disklayout
 // SuperBlock32Bit implements SuperBlock and represents the 32-bit version of
 // the ext4_super_block struct in fs/ext4/ext4.h. Should be used only if
 // RevLevel = DynamicRev and 64-bit feature is disabled.
+//
+// +marshal
 type SuperBlock32Bit struct {
 	// We embed the old superblock struct here because the 32-bit version is just
 	// an extension of the old version.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
index 7c1053fb4..2c9039327 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
@@ -19,6 +19,8 @@ package disklayout
 // 1024 bytes (smallest possible block size) and hence the superblock always
 // fits in no more than one data block. Should only be used when the 64-bit
 // feature is set.
+//
+// +marshal
 type SuperBlock64Bit struct {
 	// We embed the 32-bit struct here because 64-bit version is just an extension
 	// of the 32-bit version.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
index 9221e0251..e4709f23c 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
@@ -16,6 +16,8 @@ package disklayout
 
 // SuperBlockOld implements SuperBlock and represents the old version of the
 // superblock struct. Should be used only if RevLevel = OldRev.
+//
+// +marshal
 type SuperBlockOld struct {
 	InodesCountRaw      uint32
 	BlocksCountLo       uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
index 463b5ba21..b734b6987 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
@@ -21,7 +21,10 @@ import (
 // TestSuperBlockSize tests that the superblock structs are of the correct
 // size.
 func TestSuperBlockSize(t *testing.T) {
-	assertSize(t, SuperBlockOld{}, 84)
-	assertSize(t, SuperBlock32Bit{}, 336)
-	assertSize(t, SuperBlock64Bit{}, 1024)
+	var sbOld SuperBlockOld
+	assertSize(t, &sbOld, 84)
+	var sb32 SuperBlock32Bit
+	assertSize(t, &sb32, 336)
+	var sb64 SuperBlock64Bit
+	assertSize(t, &sb64, 1024)
 }
diff --git a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
index 9c63f04c0..a4bc08411 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
@@ -18,13 +18,13 @@ import (
 	"reflect"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/marshal"
 )
 
-func assertSize(t *testing.T, v interface{}, want uintptr) {
+func assertSize(t *testing.T, v marshal.Marshallable, want int) {
 	t.Helper()
 
-	if got := binary.Size(v); got != want {
+	if got := v.SizeBytes(); got != want {
 		t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got)
 	}
 }
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
index 08ffc2834..38fb7962b 100644
--- a/pkg/sentry/fsimpl/ext/ext.go
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -34,11 +34,10 @@ import (
 const Name = "ext"
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
-// Compiles only if FilesystemType implements vfs.FilesystemType.
-var _ vfs.FilesystemType = (*FilesystemType)(nil)
-
 // getDeviceFd returns an io.ReaderAt to the underlying device.
 // Currently there are two ways of mounting an ext(2/3/4) fs:
 //   1. Specify a mount with our internal special MountType in the OCI spec.
@@ -99,6 +98,9 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	// TODO(b/134676337): Ensure that the user is mounting readonly. If not,
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 2dbaee287..d9fd4590c 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -71,13 +71,18 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
 	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: int(f.Fd()),
+		},
+	})
 	if err != nil {
 		f.Close()
 		return nil, nil, nil, nil, err
 	}
 
 	root := mntns.Root()
+	root.IncRef()
 
 	tearDown := func() {
 		root.DecRef(ctx)
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index c36225a7c..778460107 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -18,12 +18,13 @@ import (
 	"io"
 	"sort"
 
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // extentFile is a type of regular file which uses extents to store file data.
+//
+// +stateify savable
 type extentFile struct {
 	regFile regularFile
 
@@ -58,7 +59,7 @@ func newExtentFile(args inodeArgs) (*extentFile, error) {
 func (f *extentFile) buildExtTree() error {
 	rootNodeData := f.regFile.inode.diskInode.Data()
 
-	binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header)
+	f.root.Header.UnmarshalBytes(rootNodeData[:disklayout.ExtentHeaderSize])
 
 	// Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
 	if f.root.Header.NumEntries > 4 {
@@ -77,7 +78,7 @@ func (f *extentFile) buildExtTree() error {
 			// Internal node.
 			curEntry = &disklayout.ExtentIdx{}
 		}
-		binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry)
+		curEntry.UnmarshalBytes(rootNodeData[off : off+disklayout.ExtentEntrySize])
 		f.root.Entries[i].Entry = curEntry
 	}
 
diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go
index cd10d46ee..985f76ac0 100644
--- a/pkg/sentry/fsimpl/ext/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/extent_test.go
@@ -21,7 +21,6 @@ import (
 
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 )
 
@@ -202,13 +201,14 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []
 // writeTree writes the tree represented by `root` to the inode and disk. It
 // also writes random file data on disk.
 func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte {
-	rootData := binary.Marshal(nil, binary.LittleEndian, root.Header)
+	rootData := in.diskInode.Data()
+	root.Header.MarshalBytes(rootData)
+	off := root.Header.SizeBytes()
 	for _, ep := range root.Entries {
-		rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry)
+		ep.Entry.MarshalBytes(rootData[off:])
+		off += ep.Entry.SizeBytes()
 	}
 
-	copy(in.diskInode.Data(), rootData)
-
 	var fileData []byte
 	for _, ep := range root.Entries {
 		if root.Header.Height == 0 {
@@ -223,13 +223,14 @@ func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBl
 // writeTreeToDisk is the recursive step for writeTree which writes the tree
 // on the disk only. Also writes random file data on disk.
 func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte {
-	nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header)
+	nodeData := disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:]
+	curNode.Node.Header.MarshalBytes(nodeData)
+	off := curNode.Node.Header.SizeBytes()
 	for _, ep := range curNode.Node.Entries {
-		nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry)
+		ep.Entry.MarshalBytes(nodeData[off:])
+		off += ep.Entry.SizeBytes()
 	}
 
-	copy(disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:], nodeData)
-
 	var fileData []byte
 	for _, ep := range curNode.Node.Entries {
 		if curNode.Node.Header.Height == 0 {
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index c714ddf73..917f1873d 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -38,11 +38,13 @@ var (
 )
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	vfsfs vfs.Filesystem
 
 	// mu serializes changes to the Dentry tree.
-	mu sync.RWMutex
+	mu sync.RWMutex `state:"nosave"`
 
 	// dev represents the underlying fs device. It does not require protection
 	// because io.ReaderAt permits concurrent read calls to it. It translates to
@@ -81,9 +83,9 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil)
 // stepLocked is loosely analogous to fs/namei.c:walk_component().
 //
 // Preconditions:
-//     - filesystem.mu must be locked (for writing if write param is true).
-//     - !rp.Done().
-//     - inode == vfsd.Impl().(*Dentry).inode.
+// * filesystem.mu must be locked (for writing if write param is true).
+// * !rp.Done().
+// * inode == vfsd.Impl().(*Dentry).inode.
 func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
 	if !inode.isDir() {
 		return nil, nil, syserror.ENOTDIR
@@ -166,7 +168,7 @@ func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, in
 // walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
 //
 // Preconditions:
-//     - filesystem.mu must be locked (for writing if write param is true).
+// * filesystem.mu must be locked (for writing if write param is true).
 func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
 	vfsd := rp.Start()
 	inode := vfsd.Impl().(*dentry).inode
@@ -194,8 +196,8 @@ func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.De
 // walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat().
 //
 // Preconditions:
-//     - filesystem.mu must be locked (for writing if write param is true).
-//     - !rp.Done().
+// * filesystem.mu must be locked (for writing if write param is true).
+// * !rp.Done().
 func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
 	vfsd := rp.Start()
 	inode := vfsd.Impl().(*dentry).inode
@@ -490,7 +492,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return syserror.EROFS
 }
 
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
 func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
 	_, inode, err := fs.walk(ctx, rp, false)
 	if err != nil {
@@ -504,8 +506,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return nil, err
@@ -513,8 +515,8 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
 	return nil, syserror.ENOTSUP
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return "", err
@@ -522,8 +524,8 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return "", syserror.ENOTSUP
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return err
@@ -531,8 +533,8 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return syserror.ENOTSUP
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return err
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 30636cf66..9009ba3c7 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -37,6 +37,8 @@ import (
 //           |-- regular--
 //                       |-- extent file
 //                       |-- block map file
+//
+// +stateify savable
 type inode struct {
 	// refs is a reference count. refs is accessed using atomic memory operations.
 	refs int64
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index e73e740d6..4a5539b37 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -31,6 +31,8 @@ import (
 // regularFile represents a regular file's inode. This too follows the
 // inheritance pattern prevelant in the vfs layer described in
 // pkg/sentry/vfs/README.md.
+//
+// +stateify savable
 type regularFile struct {
 	inode inode
 
@@ -67,6 +69,8 @@ func (in *inode) isRegular() bool {
 
 // directoryFD represents a directory file description. It implements
 // vfs.FileDescriptionImpl.
+//
+// +stateify savable
 type regularFileFD struct {
 	fileDescription
 	vfs.LockFD
@@ -75,7 +79,7 @@ type regularFileFD struct {
 	off int64
 
 	// offMu serializes operations that may mutate off.
-	offMu sync.Mutex
+	offMu sync.Mutex `state:"nosave"`
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index 2fd0d1fa8..5e2bcc837 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -23,6 +23,8 @@ import (
 )
 
 // symlink represents a symlink inode.
+//
+// +stateify savable
 type symlink struct {
 	inode  inode
 	target string // immutable
@@ -61,9 +63,11 @@ func (in *inode) isSymlink() bool {
 	return ok
 }
 
-// symlinkFD represents a symlink file description and implements implements
+// symlinkFD represents a symlink file description and implements
 // vfs.FileDescriptionImpl. which may only be used if open options contains
 // O_PATH. For this reason most of the functions return EBADF.
+//
+// +stateify savable
 type symlinkFD struct {
 	fileDescription
 	vfs.NoLockFD
diff --git a/pkg/sentry/fsimpl/ext/utils.go b/pkg/sentry/fsimpl/ext/utils.go
index d8b728f8c..58ef7b9b8 100644
--- a/pkg/sentry/fsimpl/ext/utils.go
+++ b/pkg/sentry/fsimpl/ext/utils.go
@@ -17,21 +17,21 @@ package ext
 import (
 	"io"
 
-	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // readFromDisk performs a binary read from disk into the given struct from
 // the absolute offset provided.
-func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error {
-	n := binary.Size(v)
+func readFromDisk(dev io.ReaderAt, abOff int64, v marshal.Marshallable) error {
+	n := v.SizeBytes()
 	buf := make([]byte, n)
 	if read, _ := dev.ReadAt(buf, abOff); read < int(n) {
 		return syserror.EIO
 	}
 
-	binary.Unmarshal(buf, binary.LittleEndian, v)
+	v.UnmarshalBytes(buf)
 	return nil
 }
 
diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD
index 999111deb..2158b1bbc 100644
--- a/pkg/sentry/fsimpl/fuse/BUILD
+++ b/pkg/sentry/fsimpl/fuse/BUILD
@@ -15,21 +15,42 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "inode_refs",
+    out = "inode_refs.go",
+    package = "fuse",
+    prefix = "inode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "inode",
+    },
+)
+
 go_library(
     name = "fuse",
     srcs = [
         "connection.go",
+        "connection_control.go",
         "dev.go",
+        "directory.go",
+        "file.go",
         "fusefs.go",
-        "init.go",
+        "inode_refs.go",
+        "read_write.go",
         "register.go",
+        "regular_file.go",
         "request_list.go",
+        "request_response.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/log",
+        "//pkg/marshal",
+        "//pkg/refs",
+        "//pkg/refsvfs2",
+        "//pkg/safemem",
         "//pkg/sentry/fsimpl/devtmpfs",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/kernel",
@@ -39,7 +60,6 @@ go_library(
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
-        "//tools/go_marshal/marshal",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
@@ -47,10 +67,15 @@ go_library(
 go_test(
     name = "fuse_test",
     size = "small",
-    srcs = ["dev_test.go"],
+    srcs = [
+        "connection_test.go",
+        "dev_test.go",
+        "utils_test.go",
+    ],
     library = ":fuse",
     deps = [
         "//pkg/abi/linux",
+        "//pkg/marshal",
         "//pkg/sentry/fsimpl/testutil",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
@@ -58,6 +83,5 @@ go_test(
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
-        "//tools/go_marshal/marshal",
     ],
 )
diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go
index 6df2728ab..8ccda1264 100644
--- a/pkg/sentry/fsimpl/fuse/connection.go
+++ b/pkg/sentry/fsimpl/fuse/connection.go
@@ -15,31 +15,17 @@
 package fuse
 
 import (
-	"errors"
-	"fmt"
 	"sync"
-	"sync/atomic"
-	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
-// maxActiveRequestsDefault is the default setting controlling the upper bound
-// on the number of active requests at any given time.
-const maxActiveRequestsDefault = 10000
-
-// Ordinary requests have even IDs, while interrupts IDs are odd.
-// Used to increment the unique ID for each FUSE request.
-var reqIDStep uint64 = 2
-
 const (
 	// fuseDefaultMaxBackground is the default value for MaxBackground.
 	fuseDefaultMaxBackground = 12
@@ -52,43 +38,39 @@ const (
 	fuseDefaultMaxPagesPerReq = 32
 )
 
-// Request represents a FUSE operation request that hasn't been sent to the
-// server yet.
+// connection is the struct by which the sentry communicates with the FUSE server daemon.
 //
-// +stateify savable
-type Request struct {
-	requestEntry
-
-	id   linux.FUSEOpID
-	hdr  *linux.FUSEHeaderIn
-	data []byte
-}
-
-// Response represents an actual response from the server, including the
-// response payload.
+// Lock order:
+// - conn.fd.mu
+// - conn.mu
+// - conn.asyncMu
 //
 // +stateify savable
-type Response struct {
-	opcode linux.FUSEOpcode
-	hdr    linux.FUSEHeaderOut
-	data   []byte
-}
-
-// connection is the struct by which the sentry communicates with the FUSE server daemon.
 type connection struct {
 	fd *DeviceFD
 
+	// mu protects access to struct memebers.
+	mu sync.Mutex `state:"nosave"`
+
+	// attributeVersion is the version of connection's attributes.
+	attributeVersion uint64
+
+	// We target FUSE 7.23.
 	// The following FUSE_INIT flags are currently unsupported by this implementation:
-	// - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC)
 	// - FUSE_EXPORT_SUPPORT
-	// - FUSE_HANDLE_KILLPRIV
 	// - FUSE_POSIX_LOCKS: requires POSIX locks
 	// - FUSE_FLOCK_LOCKS: requires POSIX locks
 	// - FUSE_AUTO_INVAL_DATA: requires page caching eviction
-	// - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction
 	// - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation
 	// - FUSE_ASYNC_DIO
-	// - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler
+	// - FUSE_PARALLEL_DIROPS (7.25)
+	// - FUSE_HANDLE_KILLPRIV (7.26)
+	// - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler (7.26)
+	// - FUSE_ABORT_ERROR (7.27)
+	// - FUSE_CACHE_SYMLINKS (7.28)
+	// - FUSE_NO_OPENDIR_SUPPORT (7.29)
+	// - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction (7.30)
+	// - FUSE_MAP_ALIGNMENT (7.31)
 
 	// initialized after receiving FUSE_INIT reply.
 	// Until it's set, suspend sending FUSE requests.
@@ -96,11 +78,7 @@ type connection struct {
 	initialized int32
 
 	// initializedChan is used to block requests before initialization.
-	initializedChan chan struct{}
-
-	// blocked when there are too many outstading backgrounds requests (NumBackground == MaxBackground).
-	// TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block.
-	blocked bool
+	initializedChan chan struct{} `state:".(bool)"`
 
 	// connected (connection established) when a new FUSE file system is created.
 	// Set to false when:
@@ -109,48 +87,55 @@ type connection struct {
 	//   device release.
 	connected bool
 
-	// aborted via sysfs.
-	// TODO(gvisor.dev/issue/3185): abort all queued requests.
-	aborted bool
-
 	// connInitError if FUSE_INIT encountered error (major version mismatch).
 	// Only set in INIT.
 	connInitError bool
 
 	// connInitSuccess if FUSE_INIT is successful.
 	// Only set in INIT.
-	// Used for destory.
+	// Used for destory (not yet implemented).
 	connInitSuccess bool
 
-	// TODO(gvisor.dev/issue/3185): All the queue logic are working in progress.
-
-	// NumberBackground is the number of requests in the background.
-	numBackground uint16
+	// aborted via sysfs, and will send ECONNABORTED to read after disconnection (instead of ENODEV).
+	// Set only if abortErr is true and via fuse control fs (not yet implemented).
+	// TODO(gvisor.dev/issue/3525): set this to true when user aborts.
+	aborted bool
 
-	// congestionThreshold for NumBackground.
-	// Negotiated in FUSE_INIT.
-	congestionThreshold uint16
+	// numWating is the number of requests waiting to be
+	// sent to FUSE device or being processed by FUSE daemon.
+	numWaiting uint32
 
-	// maxBackground is the maximum number of NumBackground.
-	// Block connection when it is reached.
-	// Negotiated in FUSE_INIT.
-	maxBackground uint16
+	// Terminology note:
+	//
+	// - `asyncNumMax` is the `MaxBackground` in the FUSE_INIT_IN struct.
+	//
+	// - `asyncCongestionThreshold` is the `CongestionThreshold` in the FUSE_INIT_IN struct.
+	//
+	// We call the "background" requests in unix term as async requests.
+	// The "async requests" in unix term is our async requests that expect a reply,
+	// i.e. `!request.noReply`
 
-	// numActiveBackground is the number of requests in background and has being marked as active.
-	numActiveBackground uint16
+	// asyncMu protects the async request fields.
+	asyncMu sync.Mutex `state:"nosave"`
 
-	// numWating is the number of requests waiting for completion.
-	numWaiting uint32
+	// asyncNum is the number of async requests.
+	// Protected by asyncMu.
+	asyncNum uint16
 
-	// TODO(gvisor.dev/issue/3185): BgQueue
-	// some queue for background queued requests.
+	// asyncCongestionThreshold the number of async requests.
+	// Negotiated in FUSE_INIT as "CongestionThreshold".
+	// TODO(gvisor.dev/issue/3529): add congestion control.
+	// Protected by asyncMu.
+	asyncCongestionThreshold uint16
 
-	// bgLock protects:
-	// MaxBackground, CongestionThreshold, NumBackground,
-	// NumActiveBackground, BgQueue, Blocked.
-	bgLock sync.Mutex
+	// asyncNumMax is the maximum number of asyncNum.
+	// Connection blocks the async requests when it is reached.
+	// Negotiated in FUSE_INIT as "MaxBackground".
+	// Protected by asyncMu.
+	asyncNumMax uint16
 
 	// maxRead is the maximum size of a read buffer in in bytes.
+	// Initialized from a fuse fs parameter.
 	maxRead uint32
 
 	// maxWrite is the maximum size of a write buffer in bytes.
@@ -165,23 +150,20 @@ type connection struct {
 	// Negotiated and only set in INIT.
 	minor uint32
 
-	// asyncRead if read pages asynchronously.
+	// atomicOTrunc is true when FUSE does not send a separate SETATTR request
+	// before open with O_TRUNC flag.
 	// Negotiated and only set in INIT.
-	asyncRead bool
+	atomicOTrunc bool
 
-	// abortErr is true if kernel need to return an unique read error after abort.
+	// asyncRead if read pages asynchronously.
 	// Negotiated and only set in INIT.
-	abortErr bool
+	asyncRead bool
 
 	// writebackCache is true for write-back cache policy,
 	// false for write-through policy.
 	// Negotiated and only set in INIT.
 	writebackCache bool
 
-	// cacheSymlinks if filesystem needs to cache READLINK responses in page cache.
-	// Negotiated and only set in INIT.
-	cacheSymlinks bool
-
 	// bigWrites if doing multi-page cached writes.
 	// Negotiated and only set in INIT.
 	bigWrites bool
@@ -189,116 +171,86 @@ type connection struct {
 	// dontMask if filestestem does not apply umask to creation modes.
 	// Negotiated in INIT.
 	dontMask bool
+
+	// noOpen if FUSE server doesn't support open operation.
+	// This flag only influence performance, not correctness of the program.
+	noOpen bool
+}
+
+func (conn *connection) saveInitializedChan() bool {
+	select {
+	case <-conn.initializedChan:
+		return true // Closed.
+	default:
+		return false // Not closed.
+	}
+}
+
+func (conn *connection) loadInitializedChan(closed bool) {
+	conn.initializedChan = make(chan struct{}, 1)
+	if closed {
+		close(conn.initializedChan)
+	}
 }
 
 // newFUSEConnection creates a FUSE connection to fd.
-func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) {
+func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, opts *filesystemOptions) (*connection, error) {
 	// Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to
 	// mount a FUSE filesystem.
 	fuseFD := fd.Impl().(*DeviceFD)
-	fuseFD.mounted = true
 
 	// Create the writeBuf for the header to be stored in.
 	hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
 	fuseFD.writeBuf = make([]byte, hdrLen)
 	fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse)
-	fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests)
+	fuseFD.fullQueueCh = make(chan struct{}, opts.maxActiveRequests)
 	fuseFD.writeCursor = 0
 
 	return &connection{
-		fd:                  fuseFD,
-		maxBackground:       fuseDefaultMaxBackground,
-		congestionThreshold: fuseDefaultCongestionThreshold,
-		maxPages:            fuseDefaultMaxPagesPerReq,
-		initializedChan:     make(chan struct{}),
-		connected:           true,
-	}, nil
-}
-
-// SetInitialized atomically sets the connection as initialized.
-func (conn *connection) SetInitialized() {
-	// Unblock the requests sent before INIT.
-	close(conn.initializedChan)
-
-	// Close the channel first to avoid the non-atomic situation
-	// where conn.initialized is true but there are
-	// tasks being blocked on the channel.
-	// And it prevents the newer tasks from gaining
-	// unnecessary higher chance to be issued before the blocked one.
-
-	atomic.StoreInt32(&(conn.initialized), int32(1))
-}
-
-// IsInitialized atomically check if the connection is initialized.
-// pairs with SetInitialized().
-func (conn *connection) Initialized() bool {
-	return atomic.LoadInt32(&(conn.initialized)) != 0
-}
-
-// NewRequest creates a new request that can be sent to the FUSE server.
-func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
-	conn.fd.mu.Lock()
-	defer conn.fd.mu.Unlock()
-	conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
-
-	hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
-	hdr := linux.FUSEHeaderIn{
-		Len:    uint32(hdrLen + payload.SizeBytes()),
-		Opcode: opcode,
-		Unique: conn.fd.nextOpID,
-		NodeID: ino,
-		UID:    uint32(creds.EffectiveKUID),
-		GID:    uint32(creds.EffectiveKGID),
-		PID:    pid,
-	}
-
-	buf := make([]byte, hdr.Len)
-	hdr.MarshalUnsafe(buf[:hdrLen])
-	payload.MarshalUnsafe(buf[hdrLen:])
-
-	return &Request{
-		id:   hdr.Unique,
-		hdr:  &hdr,
-		data: buf,
+		fd:                       fuseFD,
+		asyncNumMax:              fuseDefaultMaxBackground,
+		asyncCongestionThreshold: fuseDefaultCongestionThreshold,
+		maxRead:                  opts.maxRead,
+		maxPages:                 fuseDefaultMaxPagesPerReq,
+		initializedChan:          make(chan struct{}),
+		connected:                true,
 	}, nil
 }
 
-// Call makes a request to the server and blocks the invoking task until a
-// server responds with a response. Task should never be nil.
-// Requests will not be sent before the connection is initialized.
-// For async tasks, use CallAsync().
-func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
-	// Block requests sent before connection is initalized.
-	if !conn.Initialized() {
-		if err := t.Block(conn.initializedChan); err != nil {
-			return nil, err
-		}
-	}
-
-	return conn.call(t, r)
+// CallAsync makes an async (aka background) request.
+// It's a simple wrapper around Call().
+func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+	r.async = true
+	_, err := conn.Call(t, r)
+	return err
 }
 
-// CallAsync makes an async (aka background) request.
-// Those requests either do not expect a response (e.g. release) or
-// the response should be handled by others (e.g. init).
-// Return immediately unless the connection is blocked (before initialization).
-// Async call example: init, release, forget, aio, interrupt.
+// Call makes a request to the server.
+// Block before the connection is initialized.
 // When the Request is FUSE_INIT, it will not be blocked before initialization.
-func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+// Task should never be nil.
+//
+// For a sync request, it blocks the invoking task until
+// a server responds with a response.
+//
+// For an async request (that do not expect a response immediately),
+// it returns directly unless being blocked either before initialization
+// or when there are too many async requests ongoing.
+//
+// Example for async request:
+// init, readahead, write, async read/write, fuse_notify_reply,
+// non-sync release, interrupt, forget.
+//
+// The forget request does not have a reply,
+// as documented in include/uapi/linux/fuse.h:FUSE_FORGET.
+func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
 	// Block requests sent before connection is initalized.
 	if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT {
 		if err := t.Block(conn.initializedChan); err != nil {
-			return err
+			return nil, err
 		}
 	}
 
-	// This should be the only place that invokes call() with a nil task.
-	_, err := conn.call(nil, r)
-	return err
-}
-
-// call makes a call without blocking checks.
-func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
 	if !conn.connected {
 		return nil, syserror.ENOTCONN
 	}
@@ -315,31 +267,6 @@ func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
 	return fut.resolve(t)
 }
 
-// Error returns the error of the FUSE call.
-func (r *Response) Error() error {
-	errno := r.hdr.Error
-	if errno >= 0 {
-		return nil
-	}
-
-	sysErrNo := syscall.Errno(-errno)
-	return error(sysErrNo)
-}
-
-// UnmarshalPayload unmarshals the response data into m.
-func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
-	hdrLen := r.hdr.SizeBytes()
-	haveDataLen := r.hdr.Len - uint32(hdrLen)
-	wantDataLen := uint32(m.SizeBytes())
-
-	if haveDataLen < wantDataLen {
-		return fmt.Errorf("payload too small. Minimum data lenth required: %d,  but got data length %d", wantDataLen, haveDataLen)
-	}
-
-	m.UnmarshalUnsafe(r.data[hdrLen:])
-	return nil
-}
-
 // callFuture makes a request to the server and returns a future response.
 // Call resolve() when the response needs to be fulfilled.
 func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) {
@@ -358,11 +285,6 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse,
 	// if there are always too many ongoing requests all the time. The
 	// supported maxActiveRequests setting should be really high to avoid this.
 	for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
-		if t == nil {
-			// Since there is no task that is waiting. We must error out.
-			return nil, errors.New("FUSE request queue full")
-		}
-
 		log.Infof("Blocking request %v from being queued. Too many active requests: %v",
 			r.id, conn.fd.numActiveRequests)
 		conn.fd.mu.Unlock()
@@ -378,9 +300,19 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse,
 
 // callFutureLocked makes a request to the server and returns a future response.
 func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) {
+	// Check connected again holding conn.mu.
+	conn.mu.Lock()
+	if !conn.connected {
+		conn.mu.Unlock()
+		// we checked connected before,
+		// this must be due to aborted connection.
+		return nil, syserror.ECONNABORTED
+	}
+	conn.mu.Unlock()
+
 	conn.fd.queue.PushBack(r)
-	conn.fd.numActiveRequests += 1
-	fut := newFutureResponse(r.hdr.Opcode)
+	conn.fd.numActiveRequests++
+	fut := newFutureResponse(r)
 	conn.fd.completions[r.id] = fut
 
 	// Signal the readers that there is something to read.
@@ -388,50 +320,3 @@ func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureRes
 
 	return fut, nil
 }
-
-// futureResponse represents an in-flight request, that may or may not have
-// completed yet. Convert it to a resolved Response by calling Resolve, but note
-// that this may block.
-//
-// +stateify savable
-type futureResponse struct {
-	opcode linux.FUSEOpcode
-	ch     chan struct{}
-	hdr    *linux.FUSEHeaderOut
-	data   []byte
-}
-
-// newFutureResponse creates a future response to a FUSE request.
-func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse {
-	return &futureResponse{
-		opcode: opcode,
-		ch:     make(chan struct{}),
-	}
-}
-
-// resolve blocks the task until the server responds to its corresponding request,
-// then returns a resolved response.
-func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
-	// If there is no Task associated with this request  - then we don't try to resolve
-	// the response.  Instead, the task writing the response (proxy to the server) will
-	// process the response on our behalf.
-	if t == nil {
-		log.Infof("fuse.Response.resolve: Not waiting on a response from server.")
-		return nil, nil
-	}
-
-	if err := t.Block(f.ch); err != nil {
-		return nil, err
-	}
-
-	return f.getResponse(), nil
-}
-
-// getResponse creates a Response from the data the futureResponse has.
-func (f *futureResponse) getResponse() *Response {
-	return &Response{
-		opcode: f.opcode,
-		hdr:    *f.hdr,
-		data:   f.data,
-	}
-}
diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/connection_control.go
index 779c2bd3f..bfde78559 100644
--- a/pkg/sentry/fsimpl/fuse/init.go
+++ b/pkg/sentry/fsimpl/fuse/connection_control.go
@@ -15,7 +15,11 @@
 package fuse
 
 import (
+	"sync/atomic"
+	"syscall"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
@@ -29,9 +33,10 @@ const (
 	// Follow the same behavior as unix fuse implementation.
 	fuseMaxTimeGranNs = 1000000000
 
-	// Minimum value for MaxWrite.
+	// Minimum value for MaxWrite and MaxRead.
 	// Follow the same behavior as unix fuse implementation.
 	fuseMinMaxWrite = 4096
+	fuseMinMaxRead  = 4096
 
 	// Temporary default value for max readahead, 128kb.
 	fuseDefaultMaxReadahead = 131072
@@ -49,6 +54,26 @@ var (
 	MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold
 )
 
+// SetInitialized atomically sets the connection as initialized.
+func (conn *connection) SetInitialized() {
+	// Unblock the requests sent before INIT.
+	close(conn.initializedChan)
+
+	// Close the channel first to avoid the non-atomic situation
+	// where conn.initialized is true but there are
+	// tasks being blocked on the channel.
+	// And it prevents the newer tasks from gaining
+	// unnecessary higher chance to be issued before the blocked one.
+
+	atomic.StoreInt32(&(conn.initialized), int32(1))
+}
+
+// IsInitialized atomically check if the connection is initialized.
+// pairs with SetInitialized().
+func (conn *connection) Initialized() bool {
+	return atomic.LoadInt32(&(conn.initialized)) != 0
+}
+
 // InitSend sends a FUSE_INIT request.
 func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
 	in := linux.FUSEInitIn{
@@ -70,29 +95,31 @@ func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
 }
 
 // InitRecv receives a FUSE_INIT reply and process it.
+//
+// Preconditions: conn.asyncMu must not be held if minor verion is newer than 13.
 func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error {
 	if err := res.Error(); err != nil {
 		return err
 	}
 
-	var out linux.FUSEInitOut
-	if err := res.UnmarshalPayload(&out); err != nil {
+	initRes := fuseInitRes{initLen: res.DataLen()}
+	if err := res.UnmarshalPayload(&initRes); err != nil {
 		return err
 	}
 
-	return conn.initProcessReply(&out, hasSysAdminCap)
+	return conn.initProcessReply(&initRes.initOut, hasSysAdminCap)
 }
 
 // Process the FUSE_INIT reply from the FUSE server.
+// It tries to acquire the conn.asyncMu lock if minor version is newer than 13.
 func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error {
+	// No matter error or not, always set initialzied.
+	// to unblock the blocked requests.
+	defer conn.SetInitialized()
+
 	// No support for old major fuse versions.
 	if out.Major != linux.FUSE_KERNEL_VERSION {
 		conn.connInitError = true
-
-		// Set the connection as initialized and unblock the blocked requests
-		// (i.e. return error for them).
-		conn.SetInitialized()
-
 		return nil
 	}
 
@@ -100,29 +127,14 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
 	conn.connInitSuccess = true
 	conn.minor = out.Minor
 
-	// No support for limits before minor version 13.
-	if out.Minor >= 13 {
-		conn.bgLock.Lock()
-
-		if out.MaxBackground > 0 {
-			conn.maxBackground = out.MaxBackground
-
-			if !hasSysAdminCap &&
-				conn.maxBackground > MaxUserBackgroundRequest {
-				conn.maxBackground = MaxUserBackgroundRequest
-			}
-		}
-
-		if out.CongestionThreshold > 0 {
-			conn.congestionThreshold = out.CongestionThreshold
-
-			if !hasSysAdminCap &&
-				conn.congestionThreshold > MaxUserCongestionThreshold {
-				conn.congestionThreshold = MaxUserCongestionThreshold
-			}
-		}
-
-		conn.bgLock.Unlock()
+	// No support for negotiating MaxWrite before minor version 5.
+	if out.Minor >= 5 {
+		conn.maxWrite = out.MaxWrite
+	} else {
+		conn.maxWrite = fuseMinMaxWrite
+	}
+	if conn.maxWrite < fuseMinMaxWrite {
+		conn.maxWrite = fuseMinMaxWrite
 	}
 
 	// No support for the following flags before minor version 6.
@@ -131,8 +143,6 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
 		conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0
 		conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0
 		conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0
-		conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0
-		conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0
 
 		// TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs).
 
@@ -148,19 +158,90 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
 		}
 	}
 
-	// No support for negotiating MaxWrite before minor version 5.
-	if out.Minor >= 5 {
-		conn.maxWrite = out.MaxWrite
-	} else {
-		conn.maxWrite = fuseMinMaxWrite
+	// No support for limits before minor version 13.
+	if out.Minor >= 13 {
+		conn.asyncMu.Lock()
+
+		if out.MaxBackground > 0 {
+			conn.asyncNumMax = out.MaxBackground
+
+			if !hasSysAdminCap &&
+				conn.asyncNumMax > MaxUserBackgroundRequest {
+				conn.asyncNumMax = MaxUserBackgroundRequest
+			}
+		}
+
+		if out.CongestionThreshold > 0 {
+			conn.asyncCongestionThreshold = out.CongestionThreshold
+
+			if !hasSysAdminCap &&
+				conn.asyncCongestionThreshold > MaxUserCongestionThreshold {
+				conn.asyncCongestionThreshold = MaxUserCongestionThreshold
+			}
+		}
+
+		conn.asyncMu.Unlock()
 	}
-	if conn.maxWrite < fuseMinMaxWrite {
-		conn.maxWrite = fuseMinMaxWrite
+
+	return nil
+}
+
+// Abort this FUSE connection.
+// It tries to acquire conn.fd.mu, conn.lock, conn.bgLock in order.
+// All possible requests waiting or blocking will be aborted.
+//
+// Preconditions: conn.fd.mu is locked.
+func (conn *connection) Abort(ctx context.Context) {
+	conn.mu.Lock()
+	conn.asyncMu.Lock()
+
+	if !conn.connected {
+		conn.asyncMu.Unlock()
+		conn.mu.Unlock()
+		conn.fd.mu.Unlock()
+		return
 	}
 
-	// Set connection as initialized and unblock the requests
-	// issued before init.
-	conn.SetInitialized()
+	conn.connected = false
 
-	return nil
+	// Empty the `fd.queue` that holds the requests
+	// not yet read by the FUSE daemon yet.
+	// These are a subset of the requests in `fuse.completion` map.
+	for !conn.fd.queue.Empty() {
+		req := conn.fd.queue.Front()
+		conn.fd.queue.Remove(req)
+	}
+
+	var terminate []linux.FUSEOpID
+
+	// 2. Collect the requests have not been sent to FUSE daemon,
+	// or have not received a reply.
+	for unique := range conn.fd.completions {
+		terminate = append(terminate, unique)
+	}
+
+	// Release locks to avoid deadlock.
+	conn.asyncMu.Unlock()
+	conn.mu.Unlock()
+
+	// 1. The requets blocked before initialization.
+	// Will reach call() `connected` check and return.
+	if !conn.Initialized() {
+		conn.SetInitialized()
+	}
+
+	// 2. Terminate the requests collected above.
+	// Set ECONNABORTED error.
+	// sendError() will remove them from `fd.completion` map.
+	// Will enter the path of a normally received error.
+	for _, toTerminate := range terminate {
+		conn.fd.sendError(ctx, -int32(syscall.ECONNABORTED), toTerminate)
+	}
+
+	// 3. The requests not yet written to FUSE device.
+	// Early terminate.
+	// Will reach callFutureLocked() `connected` check and return.
+	close(conn.fd.fullQueueCh)
+
+	// TODO(gvisor.dev/issue/3528): Forget all pending forget reqs.
 }
diff --git a/pkg/sentry/fsimpl/fuse/connection_test.go b/pkg/sentry/fsimpl/fuse/connection_test.go
new file mode 100644
index 000000000..91d16c1cf
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/connection_test.go
@@ -0,0 +1,117 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"math/rand"
+	"syscall"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TestConnectionInitBlock tests if initialization
+// correctly blocks and unblocks the connection.
+// Since it's unfeasible to test kernelTask.Block() in unit test,
+// the code in Call() are not tested here.
+func TestConnectionInitBlock(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	k := kernel.KernelFromContext(s.Ctx)
+
+	conn, _, err := newTestConnection(s, k, maxActiveRequestsDefault)
+	if err != nil {
+		t.Fatalf("newTestConnection: %v", err)
+	}
+
+	select {
+	case <-conn.initializedChan:
+		t.Fatalf("initializedChan should be blocking before SetInitialized")
+	default:
+	}
+
+	conn.SetInitialized()
+
+	select {
+	case <-conn.initializedChan:
+	default:
+		t.Fatalf("initializedChan should not be blocking after SetInitialized")
+	}
+}
+
+func TestConnectionAbort(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	k := kernel.KernelFromContext(s.Ctx)
+	creds := auth.CredentialsFromContext(s.Ctx)
+	task := kernel.TaskFromContext(s.Ctx)
+
+	const numRequests uint64 = 256
+
+	conn, _, err := newTestConnection(s, k, numRequests)
+	if err != nil {
+		t.Fatalf("newTestConnection: %v", err)
+	}
+
+	testObj := &testPayload{
+		data: rand.Uint32(),
+	}
+
+	var futNormal []*futureResponse
+
+	for i := 0; i < int(numRequests); i++ {
+		req, err := conn.NewRequest(creds, uint32(i), uint64(i), 0, testObj)
+		if err != nil {
+			t.Fatalf("NewRequest creation failed: %v", err)
+		}
+		fut, err := conn.callFutureLocked(task, req)
+		if err != nil {
+			t.Fatalf("callFutureLocked failed: %v", err)
+		}
+		futNormal = append(futNormal, fut)
+	}
+
+	conn.Abort(s.Ctx)
+
+	// Abort should unblock the initialization channel.
+	// Note: no test requests are actually blocked on `conn.initializedChan`.
+	select {
+	case <-conn.initializedChan:
+	default:
+		t.Fatalf("initializedChan should not be blocking after SetInitialized")
+	}
+
+	// Abort will return ECONNABORTED error to unblocked requests.
+	for _, fut := range futNormal {
+		if fut.getResponse().hdr.Error != -int32(syscall.ECONNABORTED) {
+			t.Fatalf("Incorrect error code received for aborted connection: %v", fut.getResponse().hdr.Error)
+		}
+	}
+
+	// After abort, Call() should return directly with ENOTCONN.
+	req, err := conn.NewRequest(creds, 0, 0, 0, testObj)
+	if err != nil {
+		t.Fatalf("NewRequest creation failed: %v", err)
+	}
+	_, err = conn.Call(task, req)
+	if err != syserror.ENOTCONN {
+		t.Fatalf("Incorrect error code received for Call() after connection aborted")
+	}
+
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
index e522ff9a0..1b86a4b4c 100644
--- a/pkg/sentry/fsimpl/fuse/dev.go
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -19,7 +19,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -32,6 +31,8 @@ import (
 const fuseDevMinor = 229
 
 // fuseDevice implements vfs.Device for /dev/fuse.
+//
+// +stateify savable
 type fuseDevice struct{}
 
 // Open implements vfs.Device.Open.
@@ -50,15 +51,14 @@ func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
 }
 
 // DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
+//
+// +stateify savable
 type DeviceFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
 	vfs.NoLockFD
 
-	// mounted specifies whether a FUSE filesystem was mounted using the DeviceFD.
-	mounted bool
-
 	// nextOpID is used to create new requests.
 	nextOpID linux.FUSEOpID
 
@@ -83,7 +83,7 @@ type DeviceFD struct {
 	writeCursorFR *futureResponse
 
 	// mu protects all the queues, maps, buffers and cursors and nextOpID.
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
 	// waitQueue is used to notify interested parties when the device becomes
 	// readable or writable.
@@ -92,21 +92,36 @@ type DeviceFD struct {
 	// fullQueueCh is a channel used to synchronize the readers with the writers.
 	// Writers (inbound requests to the filesystem) block if there are too many
 	// unprocessed in-flight requests.
-	fullQueueCh chan struct{}
+	fullQueueCh chan struct{} `state:".(int)"`
 
 	// fs is the FUSE filesystem that this FD is being used for.
 	fs *filesystem
 }
 
+func (fd *DeviceFD) saveFullQueueCh() int {
+	return cap(fd.fullQueueCh)
+}
+
+func (fd *DeviceFD) loadFullQueueCh(capacity int) {
+	fd.fullQueueCh = make(chan struct{}, capacity)
+}
+
 // Release implements vfs.FileDescriptionImpl.Release.
-func (fd *DeviceFD) Release(context.Context) {
-	fd.fs.conn.connected = false
+func (fd *DeviceFD) Release(ctx context.Context) {
+	if fd.fs != nil {
+		fd.fs.conn.mu.Lock()
+		fd.fs.conn.connected = false
+		fd.fs.conn.mu.Unlock()
+
+		fd.fs.VFSFilesystem().DecRef(ctx)
+		fd.fs = nil
+	}
 }
 
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
-	if !fd.mounted {
+	if fd.fs == nil {
 		return 0, syserror.EPERM
 	}
 
@@ -116,10 +131,16 @@ func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset in
 // Read implements vfs.FileDescriptionImpl.Read.
 func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
 	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
-	if !fd.mounted {
+	if fd.fs == nil {
 		return 0, syserror.EPERM
 	}
 
+	// Return ENODEV if the filesystem is umounted.
+	if fd.fs.umounted {
+		// TODO(gvisor.dev/issue/3525): return ECONNABORTED if aborted via fuse control fs.
+		return 0, syserror.ENODEV
+	}
+
 	// We require that any Read done on this filesystem have a sane minimum
 	// read buffer. It must have the capacity for the fixed parts of any request
 	// header (Linux uses the request header and the FUSEWriteIn header for this
@@ -143,58 +164,82 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R
 }
 
 // readLocked implements the reading of the fuse device while locked with DeviceFD.mu.
+//
+// Preconditions: dst is large enough for any reasonable request.
 func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	if fd.queue.Empty() {
-		return 0, syserror.ErrWouldBlock
-	}
+	var req *Request
 
-	var readCursor uint32
-	var bytesRead int64
-	for {
-		req := fd.queue.Front()
-		if dst.NumBytes() < int64(req.hdr.Len) {
-			// The request is too large. Cannot process it. All requests must be smaller than the
-			// negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
-			// handshake.
-			errno := -int32(syscall.EIO)
-			if req.hdr.Opcode == linux.FUSE_SETXATTR {
-				errno = -int32(syscall.E2BIG)
-			}
+	// Find the first valid request.
+	// For the normal case this loop only execute once.
+	for !fd.queue.Empty() {
+		req = fd.queue.Front()
 
-			// Return the error to the calling task.
-			if err := fd.sendError(ctx, errno, req); err != nil {
-				return 0, err
-			}
+		if int64(req.hdr.Len)+int64(len(req.payload)) <= dst.NumBytes() {
+			break
+		}
 
-			// We're done with this request.
-			fd.queue.Remove(req)
+		// The request is too large. Cannot process it. All requests must be smaller than the
+		// negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
+		// handshake.
+		errno := -int32(syscall.EIO)
+		if req.hdr.Opcode == linux.FUSE_SETXATTR {
+			errno = -int32(syscall.E2BIG)
+		}
 
-			// Restart the read as this request was invalid.
-			log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.")
-			return fd.readLocked(ctx, dst, opts)
+		// Return the error to the calling task.
+		if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil {
+			return 0, err
 		}
 
-		n, err := dst.CopyOut(ctx, req.data[readCursor:])
+		// We're done with this request.
+		fd.queue.Remove(req)
+		req = nil
+	}
+
+	if req == nil {
+		return 0, syserror.ErrWouldBlock
+	}
+
+	// We already checked the size: dst must be able to fit the whole request.
+	// Now we write the marshalled header, the payload,
+	// and the potential additional payload
+	// to the user memory IOSequence.
+
+	n, err := dst.CopyOut(ctx, req.data)
+	if err != nil {
+		return 0, err
+	}
+	if n != len(req.data) {
+		return 0, syserror.EIO
+	}
+
+	if req.hdr.Opcode == linux.FUSE_WRITE {
+		written, err := dst.DropFirst(n).CopyOut(ctx, req.payload)
 		if err != nil {
 			return 0, err
 		}
-		readCursor += uint32(n)
-		bytesRead += int64(n)
-
-		if readCursor >= req.hdr.Len {
-			// Fully done with this req, remove it from the queue.
-			fd.queue.Remove(req)
-			break
+		if written != len(req.payload) {
+			return 0, syserror.EIO
 		}
+		n += int(written)
 	}
 
-	return bytesRead, nil
+	// Fully done with this req, remove it from the queue.
+	fd.queue.Remove(req)
+
+	// Remove noReply ones from map of requests expecting a reply.
+	if req.noReply {
+		fd.numActiveRequests -= 1
+		delete(fd.completions, req.hdr.Unique)
+	}
+
+	return int64(n), nil
 }
 
 // PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
 	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
-	if !fd.mounted {
+	if fd.fs == nil {
 		return 0, syserror.EPERM
 	}
 
@@ -211,10 +256,15 @@ func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.
 // writeLocked implements writing to the fuse device while locked with DeviceFD.mu.
 func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
-	if !fd.mounted {
+	if fd.fs == nil {
 		return 0, syserror.EPERM
 	}
 
+	// Return ENODEV if the filesystem is umounted.
+	if fd.fs.umounted {
+		return 0, syserror.ENODEV
+	}
+
 	var cn, n int64
 	hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
 
@@ -276,7 +326,8 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt
 
 			fut, ok := fd.completions[hdr.Unique]
 			if !ok {
-				// Server sent us a response for a request we never sent?
+				// Server sent us a response for a request we never sent,
+				// or for which we already received a reply (e.g. aborted), an unlikely event.
 				return 0, syserror.EINVAL
 			}
 
@@ -307,8 +358,23 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt
 
 // Readiness implements vfs.FileDescriptionImpl.Readiness.
 func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	return fd.readinessLocked(mask)
+}
+
+// readinessLocked implements checking the readiness of the fuse device while
+// locked with DeviceFD.mu.
+func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask {
 	var ready waiter.EventMask
-	ready |= waiter.EventOut // FD is always writable
+
+	if fd.fs.umounted {
+		ready |= waiter.EventErr
+		return ready & mask
+	}
+
+	// FD is always writable.
+	ready |= waiter.EventOut
 	if !fd.queue.Empty() {
 		// Have reqs available, FD is readable.
 		ready |= waiter.EventIn
@@ -330,7 +396,7 @@ func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
 // Seek implements vfs.FileDescriptionImpl.Seek.
 func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
 	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
-	if !fd.mounted {
+	if fd.fs == nil {
 		return 0, syserror.EPERM
 	}
 
@@ -338,59 +404,59 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64
 }
 
 // sendResponse sends a response to the waiting task (if any).
+//
+// Preconditions: fd.mu must be held.
 func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
-	// See if the running task need to perform some action before returning.
-	// Since we just finished writing the future, we can be sure that
-	// getResponse generates a populated response.
-	if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil {
-		return err
-	}
+	// Signal the task waiting on a response if any.
+	defer close(fut.ch)
 
 	// Signal that the queue is no longer full.
 	select {
 	case fd.fullQueueCh <- struct{}{}:
 	default:
 	}
-	fd.numActiveRequests -= 1
+	fd.numActiveRequests--
+
+	if fut.async {
+		return fd.asyncCallBack(ctx, fut.getResponse())
+	}
 
-	// Signal the task waiting on a response.
-	close(fut.ch)
 	return nil
 }
 
-// sendError sends an error response to the waiting task (if any).
-func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error {
+// sendError sends an error response to the waiting task (if any) by calling sendResponse().
+//
+// Preconditions: fd.mu must be held.
+func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error {
 	// Return the error to the calling task.
 	outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
 	respHdr := linux.FUSEHeaderOut{
 		Len:    outHdrLen,
 		Error:  errno,
-		Unique: req.hdr.Unique,
+		Unique: unique,
 	}
 
 	fut, ok := fd.completions[respHdr.Unique]
 	if !ok {
-		// Server sent us a response for a request we never sent?
+		// A response for a request we never sent,
+		// or for which we already received a reply (e.g. aborted).
 		return syserror.EINVAL
 	}
 	delete(fd.completions, respHdr.Unique)
 
 	fut.hdr = &respHdr
-	if err := fd.sendResponse(ctx, fut); err != nil {
-		return err
-	}
-
-	return nil
+	return fd.sendResponse(ctx, fut)
 }
 
-// noReceiverAction has the calling kernel.Task do some action if its known that no
-// receiver is going to be waiting on the future channel. This is to be used by:
-// FUSE_INIT.
-func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error {
-	if r.opcode == linux.FUSE_INIT {
+// asyncCallBack executes pre-defined callback function for async requests.
+// Currently used by: FUSE_INIT.
+func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error {
+	switch r.opcode {
+	case linux.FUSE_INIT:
 		creds := auth.CredentialsFromContext(ctx)
 		rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace()
 		return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs))
+		// TODO(gvisor.dev/issue/3247): support async read: correctly process the response.
 	}
 
 	return nil
diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go
index 1ffe7ccd2..5986133e9 100644
--- a/pkg/sentry/fsimpl/fuse/dev_test.go
+++ b/pkg/sentry/fsimpl/fuse/dev_test.go
@@ -16,7 +16,6 @@ package fuse
 
 import (
 	"fmt"
-	"io"
 	"math/rand"
 	"testing"
 
@@ -28,17 +27,12 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
 // echoTestOpcode is the Opcode used during testing. The server used in tests
 // will simply echo the payload back with the appropriate headers.
 const echoTestOpcode linux.FUSEOpcode = 1000
 
-type testPayload struct {
-	data uint32
-}
-
 // TestFUSECommunication tests that the communication layer between the Sentry and the
 // FUSE server daemon works as expected.
 func TestFUSECommunication(t *testing.T) {
@@ -327,102 +321,3 @@ func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.F
 		}
 	}
 }
-
-func setup(t *testing.T) *testutil.System {
-	k, err := testutil.Boot()
-	if err != nil {
-		t.Fatalf("Error creating kernel: %v", err)
-	}
-
-	ctx := k.SupervisorContext()
-	creds := auth.CredentialsFromContext(ctx)
-
-	k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
-		AllowUserList:  true,
-		AllowUserMount: true,
-	})
-
-	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
-	if err != nil {
-		t.Fatalf("NewMountNamespace(): %v", err)
-	}
-
-	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
-}
-
-// newTestConnection creates a fuse connection that the sentry can communicate with
-// and the FD for the server to communicate with.
-func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
-	vfsObj := &vfs.VirtualFilesystem{}
-	fuseDev := &DeviceFD{}
-
-	if err := vfsObj.Init(system.Ctx); err != nil {
-		return nil, nil, err
-	}
-
-	vd := vfsObj.NewAnonVirtualDentry("genCountFD")
-	defer vd.DecRef(system.Ctx)
-	if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil {
-		return nil, nil, err
-	}
-
-	fsopts := filesystemOptions{
-		maxActiveRequests: maxActiveRequests,
-	}
-	fs, err := NewFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	return fs.conn, &fuseDev.vfsfd, nil
-}
-
-// SizeBytes implements marshal.Marshallable.SizeBytes.
-func (t *testPayload) SizeBytes() int {
-	return 4
-}
-
-// MarshalBytes implements marshal.Marshallable.MarshalBytes.
-func (t *testPayload) MarshalBytes(dst []byte) {
-	usermem.ByteOrder.PutUint32(dst[:4], t.data)
-}
-
-// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
-func (t *testPayload) UnmarshalBytes(src []byte) {
-	*t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])}
-}
-
-// Packed implements marshal.Marshallable.Packed.
-func (t *testPayload) Packed() bool {
-	return true
-}
-
-// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
-func (t *testPayload) MarshalUnsafe(dst []byte) {
-	t.MarshalBytes(dst)
-}
-
-// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
-func (t *testPayload) UnmarshalUnsafe(src []byte) {
-	t.UnmarshalBytes(src)
-}
-
-// CopyOutN implements marshal.Marshallable.CopyOutN.
-func (t *testPayload) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {
-	panic("not implemented")
-}
-
-// CopyOut implements marshal.Marshallable.CopyOut.
-func (t *testPayload) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {
-	panic("not implemented")
-}
-
-// CopyIn implements marshal.Marshallable.CopyIn.
-func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {
-	panic("not implemented")
-}
-
-// WriteTo implements io.WriterTo.WriteTo.
-func (t *testPayload) WriteTo(w io.Writer) (int64, error) {
-	panic("not implemented")
-}
diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go
new file mode 100644
index 000000000..8f220a04b
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/directory.go
@@ -0,0 +1,105 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+type directoryFD struct {
+	fileDescription
+}
+
+// Allocate implements directoryFD.Allocate.
+func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	return syserror.EISDIR
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return 0, syserror.EISDIR
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return 0, syserror.EISDIR
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.EISDIR
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.EISDIR
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error {
+	fusefs := dir.inode().fs
+	task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+
+	in := linux.FUSEReadIn{
+		Fh:     dir.Fh,
+		Offset: uint64(atomic.LoadInt64(&dir.off)),
+		Size:   linux.FUSE_PAGE_SIZE,
+		Flags:  dir.statusFlags(),
+	}
+
+	// TODO(gVisor.dev/issue/3404): Support FUSE_READDIRPLUS.
+	req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), dir.inode().nodeID, linux.FUSE_READDIR, &in)
+	if err != nil {
+		return err
+	}
+
+	res, err := fusefs.conn.Call(task, req)
+	if err != nil {
+		return err
+	}
+	if err := res.Error(); err != nil {
+		return err
+	}
+
+	var out linux.FUSEDirents
+	if err := res.UnmarshalPayload(&out); err != nil {
+		return err
+	}
+
+	for _, fuseDirent := range out.Dirents {
+		nextOff := int64(fuseDirent.Meta.Off)
+		dirent := vfs.Dirent{
+			Name:    fuseDirent.Name,
+			Type:    uint8(fuseDirent.Meta.Type),
+			Ino:     fuseDirent.Meta.Ino,
+			NextOff: nextOff,
+		}
+
+		if err := callback.Handle(dirent); err != nil {
+			return err
+		}
+		atomic.StoreInt64(&dir.off, nextOff)
+	}
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/file.go b/pkg/sentry/fsimpl/fuse/file.go
new file mode 100644
index 000000000..83f2816b7
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/file.go
@@ -0,0 +1,133 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fileDescription implements vfs.FileDescriptionImpl for fuse.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
+
+	// the file handle used in userspace.
+	Fh uint64
+
+	// Nonseekable is indicate cannot perform seek on a file.
+	Nonseekable bool
+
+	// DirectIO suggest fuse to use direct io operation.
+	DirectIO bool
+
+	// OpenFlag is the flag returned by open.
+	OpenFlag uint32
+
+	// off is the file offset.
+	off int64
+}
+
+func (fd *fileDescription) dentry() *kernfs.Dentry {
+	return fd.vfsfd.Dentry().Impl().(*kernfs.Dentry)
+}
+
+func (fd *fileDescription) inode() *inode {
+	return fd.dentry().Inode().(*inode)
+}
+
+func (fd *fileDescription) filesystem() *vfs.Filesystem {
+	return fd.vfsfd.VirtualDentry().Mount().Filesystem()
+}
+
+func (fd *fileDescription) statusFlags() uint32 {
+	return fd.vfsfd.StatusFlags()
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *fileDescription) Release(ctx context.Context) {
+	// no need to release if FUSE server doesn't implement Open.
+	conn := fd.inode().fs.conn
+	if conn.noOpen {
+		return
+	}
+
+	in := linux.FUSEReleaseIn{
+		Fh:    fd.Fh,
+		Flags: fd.statusFlags(),
+	}
+	// TODO(gvisor.dev/issue/3245): add logic when we support file lock owner.
+	var opcode linux.FUSEOpcode
+	if fd.inode().Mode().IsDir() {
+		opcode = linux.FUSE_RELEASEDIR
+	} else {
+		opcode = linux.FUSE_RELEASE
+	}
+	kernelTask := kernel.TaskFromContext(ctx)
+	// ignoring errors and FUSE server reply is analogous to Linux's behavior.
+	req, err := conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), fd.inode().nodeID, opcode, &in)
+	if err != nil {
+		// No way to invoke Call() with an errored request.
+		return
+	}
+	// The reply will be ignored since no callback is defined in asyncCallBack().
+	conn.CallAsync(kernelTask, req)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return 0, nil
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return 0, nil
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return 0, nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return 0, nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := fd.filesystem()
+	inode := fd.inode()
+	return inode.Stat(ctx, fs, opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	fs := fd.filesystem()
+	creds := auth.CredentialsFromContext(ctx)
+	return fd.inode().setAttr(ctx, fs, creds, opts, true, fd.Fh)
+}
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index 83c24ec25..e7ef5998e 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -16,24 +16,36 @@
 package fuse
 
 import (
+	"math"
 	"strconv"
+	"sync"
+	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
 )
 
 // Name is the default filesystem name.
 const Name = "fuse"
 
+// maxActiveRequestsDefault is the default setting controlling the upper bound
+// on the number of active requests at any given time.
+const maxActiveRequestsDefault = 10000
+
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
+// +stateify savable
 type filesystemOptions struct {
 	// userID specifies the numeric uid of the mount owner.
 	// This option should not be specified by the filesystem owner.
@@ -56,9 +68,16 @@ type filesystemOptions struct {
 	// exist at any time. Any further requests will block when trying to
 	// Call the server.
 	maxActiveRequests uint64
+
+	// maxRead is the max number of bytes to read,
+	// specified as "max_read" in fs parameters.
+	// If not specified by user, use math.MaxUint32 as default value.
+	maxRead uint32
 }
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 	devMinor uint32
@@ -69,6 +88,9 @@ type filesystem struct {
 
 	// opts is the options the fusefs is initialized with.
 	opts *filesystemOptions
+
+	// umounted is true if filesystem.Release() has been called.
+	umounted bool
 }
 
 // Name implements vfs.FilesystemType.Name.
@@ -76,6 +98,9 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	devMinor, err := vfsObj.GetAnonBlockDevMinor()
@@ -142,14 +167,29 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	// Set the maxInFlightRequests option.
 	fsopts.maxActiveRequests = maxActiveRequestsDefault
 
+	if maxReadStr, ok := mopts["max_read"]; ok {
+		delete(mopts, "max_read")
+		maxRead, err := strconv.ParseUint(maxReadStr, 10, 32)
+		if err != nil {
+			log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr)
+			return nil, nil, syserror.EINVAL
+		}
+		if maxRead < fuseMinMaxRead {
+			maxRead = fuseMinMaxRead
+		}
+		fsopts.maxRead = uint32(maxRead)
+	} else {
+		fsopts.maxRead = math.MaxUint32
+	}
+
 	// Check for unparsed options.
 	if len(mopts) != 0 {
-		log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts)
+		log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts)
 		return nil, nil, syserror.EINVAL
 	}
 
 	// Create a new FUSE filesystem.
-	fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
+	fs, err := newFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
 	if err != nil {
 		log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err)
 		return nil, nil, err
@@ -165,26 +205,28 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	}
 
 	// root is the fusefs root directory.
-	root := fs.newInode(creds, fsopts.rootMode)
+	root := fs.newRootInode(ctx, creds, fsopts.rootMode)
 
 	return fs.VFSFilesystem(), root.VFSDentry(), nil
 }
 
-// NewFUSEFilesystem creates a new FUSE filesystem.
-func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
-	fs := &filesystem{
-		devMinor: devMinor,
-		opts:     opts,
-	}
-
-	conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests)
+// newFUSEFilesystem creates a new FUSE filesystem.
+func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
+	conn, err := newFUSEConnection(ctx, device, opts)
 	if err != nil {
 		log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err)
 		return nil, syserror.EINVAL
 	}
 
-	fs.conn = conn
 	fuseFD := device.Impl().(*DeviceFD)
+
+	fs := &filesystem{
+		devMinor: devMinor,
+		opts:     opts,
+		conn:     conn,
+	}
+
+	fs.VFSFilesystem().IncRef()
 	fuseFD.fs = fs
 
 	return fs, nil
@@ -192,39 +234,361 @@ func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt
 
 // Release implements vfs.FilesystemImpl.Release.
 func (fs *filesystem) Release(ctx context.Context) {
+	fs.conn.fd.mu.Lock()
+
+	fs.umounted = true
+	fs.conn.Abort(ctx)
+	// Notify all the waiters on this fd.
+	fs.conn.fd.waitQueue.Notify(waiter.EventIn)
+
+	fs.conn.fd.mu.Unlock()
+
 	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
 	fs.Filesystem.Release(ctx)
 }
 
 // inode implements kernfs.Inode.
+//
+// +stateify savable
 type inode struct {
+	inodeRefs
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
-	kernfs.InodeNoDynamicLookup
-	kernfs.InodeNotSymlink
 	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
 	kernfs.OrderedChildren
 
+	// the owning filesystem. fs is immutable.
+	fs *filesystem
+
+	// metaDataMu protects the metadata of this inode.
+	metadataMu sync.Mutex
+
+	nodeID uint64
+
 	locks vfs.FileLocks
 
-	dentry kernfs.Dentry
+	// size of the file.
+	size uint64
+
+	// attributeVersion is the version of inode's attributes.
+	attributeVersion uint64
+
+	// attributeTime is the remaining vaild time of attributes.
+	attributeTime uint64
+
+	// version of the inode.
+	version uint64
+
+	// link is result of following a symbolic link.
+	link string
 }
 
-func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
-	i := &inode{}
-	i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
+func (fs *filesystem) newRootInode(ctx context.Context, creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
+	i := &inode{fs: fs, nodeID: 1}
+	i.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755)
 	i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	i.dentry.Init(i)
+	i.EnableLeakCheck()
 
-	return &i.dentry
+	var d kernfs.Dentry
+	d.Init(&fs.Filesystem, i)
+	return &d
+}
+
+func (fs *filesystem) newInode(ctx context.Context, nodeID uint64, attr linux.FUSEAttr) kernfs.Inode {
+	i := &inode{fs: fs, nodeID: nodeID}
+	creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.UID), EffectiveKUID: auth.KUID(attr.UID)}
+	i.InodeAttrs.Init(ctx, &creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode))
+	atomic.StoreUint64(&i.size, attr.Size)
+	i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	i.EnableLeakCheck()
+	return i
 }
 
 // Open implements kernfs.Inode.Open.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	isDir := i.InodeAttrs.Mode().IsDir()
+	// return error if specified to open directory but inode is not a directory.
+	if !isDir && opts.Mode.IsDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS {
+		return nil, syserror.EOVERFLOW
+	}
+
+	var fd *fileDescription
+	var fdImpl vfs.FileDescriptionImpl
+	if isDir {
+		directoryFD := &directoryFD{}
+		fd = &(directoryFD.fileDescription)
+		fdImpl = directoryFD
+	} else {
+		regularFD := &regularFileFD{}
+		fd = &(regularFD.fileDescription)
+		fdImpl = regularFD
+	}
+	// FOPEN_KEEP_CACHE is the defualt flag for noOpen.
+	fd.OpenFlag = linux.FOPEN_KEEP_CACHE
+
+	// Only send open request when FUSE server support open or is opening a directory.
+	if !i.fs.conn.noOpen || isDir {
+		kernelTask := kernel.TaskFromContext(ctx)
+		if kernelTask == nil {
+			log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context")
+			return nil, syserror.EINVAL
+		}
+
+		// Build the request.
+		var opcode linux.FUSEOpcode
+		if isDir {
+			opcode = linux.FUSE_OPENDIR
+		} else {
+			opcode = linux.FUSE_OPEN
+		}
+
+		in := linux.FUSEOpenIn{Flags: opts.Flags & ^uint32(linux.O_CREAT|linux.O_EXCL|linux.O_NOCTTY)}
+		if !i.fs.conn.atomicOTrunc {
+			in.Flags &= ^uint32(linux.O_TRUNC)
+		}
+
+		req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in)
+		if err != nil {
+			return nil, err
+		}
+
+		// Send the request and receive the reply.
+		res, err := i.fs.conn.Call(kernelTask, req)
+		if err != nil {
+			return nil, err
+		}
+		if err := res.Error(); err == syserror.ENOSYS && !isDir {
+			i.fs.conn.noOpen = true
+		} else if err != nil {
+			return nil, err
+		} else {
+			out := linux.FUSEOpenOut{}
+			if err := res.UnmarshalPayload(&out); err != nil {
+				return nil, err
+			}
+
+			// Process the reply.
+			fd.OpenFlag = out.OpenFlag
+			if isDir {
+				fd.OpenFlag &= ^uint32(linux.FOPEN_DIRECT_IO)
+			}
+
+			fd.Fh = out.Fh
+		}
+	}
+
+	// TODO(gvisor.dev/issue/3234): invalidate mmap after implemented it for FUSE Inode
+	fd.DirectIO = fd.OpenFlag&linux.FOPEN_DIRECT_IO != 0
+	fdOptions := &vfs.FileDescriptionOptions{}
+	if fd.OpenFlag&linux.FOPEN_NONSEEKABLE != 0 {
+		fdOptions.DenyPRead = true
+		fdOptions.DenyPWrite = true
+		fd.Nonseekable = true
+	}
+
+	// If we don't send SETATTR before open (which is indicated by atomicOTrunc)
+	// and O_TRUNC is set, update the inode's version number and clean existing data
+	// by setting the file size to 0.
+	if i.fs.conn.atomicOTrunc && opts.Flags&linux.O_TRUNC != 0 {
+		i.fs.conn.mu.Lock()
+		i.fs.conn.attributeVersion++
+		i.attributeVersion = i.fs.conn.attributeVersion
+		atomic.StoreUint64(&i.size, 0)
+		i.fs.conn.mu.Unlock()
+		i.attributeTime = 0
+	}
+
+	if err := fd.vfsfd.Init(fdImpl, opts.Flags, rp.Mount(), d.VFSDentry(), fdOptions); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// Lookup implements kernfs.Inode.Lookup.
+func (i *inode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+	in := linux.FUSELookupIn{Name: name}
+	return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in)
+}
+
+// Keep implements kernfs.Inode.Keep.
+func (i *inode) Keep() bool {
+	// Return true so that kernfs keeps the new dentry pointing to this
+	// inode in the dentry tree. This is needed because inodes created via
+	// Lookup are not temporary. They might refer to existing files on server
+	// that can be Unlink'd/Rmdir'd.
+	return true
+}
+
+// IterDirents implements kernfs.Inode.IterDirents.
+func (*inode) IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	return offset, nil
+}
+
+// NewFile implements kernfs.Inode.NewFile.
+func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) {
+	kernelTask := kernel.TaskFromContext(ctx)
+	if kernelTask == nil {
+		log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID)
+		return nil, syserror.EINVAL
+	}
+	in := linux.FUSECreateIn{
+		CreateMeta: linux.FUSECreateMeta{
+			Flags: opts.Flags,
+			Mode:  uint32(opts.Mode) | linux.S_IFREG,
+			Umask: uint32(kernelTask.FSContext().Umask()),
+		},
+		Name: name,
+	}
+	return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in)
+}
+
+// NewNode implements kernfs.Inode.NewNode.
+func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (kernfs.Inode, error) {
+	in := linux.FUSEMknodIn{
+		MknodMeta: linux.FUSEMknodMeta{
+			Mode:  uint32(opts.Mode),
+			Rdev:  linux.MakeDeviceID(uint16(opts.DevMajor), opts.DevMinor),
+			Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
+		},
+		Name: name,
+	}
+	return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in)
+}
+
+// NewSymlink implements kernfs.Inode.NewSymlink.
+func (i *inode) NewSymlink(ctx context.Context, name, target string) (kernfs.Inode, error) {
+	in := linux.FUSESymLinkIn{
+		Name:   name,
+		Target: target,
+	}
+	return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in)
+}
+
+// Unlink implements kernfs.Inode.Unlink.
+func (i *inode) Unlink(ctx context.Context, name string, child kernfs.Inode) error {
+	kernelTask := kernel.TaskFromContext(ctx)
+	if kernelTask == nil {
+		log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
+		return syserror.EINVAL
+	}
+	in := linux.FUSEUnlinkIn{Name: name}
+	req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in)
 	if err != nil {
+		return err
+	}
+	res, err := i.fs.conn.Call(kernelTask, req)
+	if err != nil {
+		return err
+	}
+	// only return error, discard res.
+	return res.Error()
+}
+
+// NewDir implements kernfs.Inode.NewDir.
+func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
+	in := linux.FUSEMkdirIn{
+		MkdirMeta: linux.FUSEMkdirMeta{
+			Mode:  uint32(opts.Mode),
+			Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
+		},
+		Name: name,
+	}
+	return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in)
+}
+
+// RmDir implements kernfs.Inode.RmDir.
+func (i *inode) RmDir(ctx context.Context, name string, child kernfs.Inode) error {
+	fusefs := i.fs
+	task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+
+	in := linux.FUSERmDirIn{Name: name}
+	req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in)
+	if err != nil {
+		return err
+	}
+
+	res, err := i.fs.conn.Call(task, req)
+	if err != nil {
+		return err
+	}
+	return res.Error()
+}
+
+// newEntry calls FUSE server for entry creation and allocates corresponding entry according to response.
+// Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP.
+func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (kernfs.Inode, error) {
+	kernelTask := kernel.TaskFromContext(ctx)
+	if kernelTask == nil {
+		log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
+		return nil, syserror.EINVAL
+	}
+	req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload)
+	if err != nil {
+		return nil, err
+	}
+	res, err := i.fs.conn.Call(kernelTask, req)
+	if err != nil {
+		return nil, err
+	}
+	if err := res.Error(); err != nil {
+		return nil, err
+	}
+	out := linux.FUSEEntryOut{}
+	if err := res.UnmarshalPayload(&out); err != nil {
 		return nil, err
 	}
-	return fd.VFSFileDescription(), nil
+	if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) {
+		return nil, syserror.EIO
+	}
+	child := i.fs.newInode(ctx, out.NodeID, out.Attr)
+	return child, nil
+}
+
+// Getlink implements kernfs.Inode.Getlink.
+func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	path, err := i.Readlink(ctx, mnt)
+	return vfs.VirtualDentry{}, path, err
+}
+
+// Readlink implements kernfs.Inode.Readlink.
+func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
+	if i.Mode().FileType()&linux.S_IFLNK == 0 {
+		return "", syserror.EINVAL
+	}
+	if len(i.link) == 0 {
+		kernelTask := kernel.TaskFromContext(ctx)
+		if kernelTask == nil {
+			log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context")
+			return "", syserror.EINVAL
+		}
+		req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{})
+		if err != nil {
+			return "", err
+		}
+		res, err := i.fs.conn.Call(kernelTask, req)
+		if err != nil {
+			return "", err
+		}
+		i.link = string(res.data[res.hdr.SizeBytes():])
+		if !mnt.Options().ReadOnly {
+			i.attributeTime = 0
+		}
+	}
+	return i.link, nil
+}
+
+// getFUSEAttr returns a linux.FUSEAttr of this inode stored in local cache.
+// TODO(gvisor.dev/issue/3679): Add support for other fields.
+func (i *inode) getFUSEAttr() linux.FUSEAttr {
+	return linux.FUSEAttr{
+		Ino:  i.Ino(),
+		Size: atomic.LoadUint64(&i.size),
+		Mode: uint32(i.Mode()),
+	}
 }
 
 // statFromFUSEAttr makes attributes from linux.FUSEAttr to linux.Statx. The
@@ -280,45 +644,179 @@ func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx {
 	return stat
 }
 
-// Stat implements kernfs.Inode.Stat.
-func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	fusefs := fs.Impl().(*filesystem)
-	conn := fusefs.conn
-	task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+// getAttr gets the attribute of this inode by issuing a FUSE_GETATTR request
+// or read from local cache. It updates the corresponding attributes if
+// necessary.
+func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions, flags uint32, fh uint64) (linux.FUSEAttr, error) {
+	attributeVersion := atomic.LoadUint64(&i.fs.conn.attributeVersion)
+
+	// TODO(gvisor.dev/issue/3679): send the request only if
+	// - invalid local cache for fields specified in the opts.Mask
+	// - forced update
+	// - i.attributeTime expired
+	// If local cache is still valid, return local cache.
+	// Currently we always send a request,
+	// and we always set the metadata with the new result,
+	// unless attributeVersion has changed.
+
+	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		log.Warningf("couldn't get kernel task from context")
-		return linux.Statx{}, syserror.EINVAL
+		return linux.FUSEAttr{}, syserror.EINVAL
 	}
 
-	var in linux.FUSEGetAttrIn
-	// We don't set any attribute in the request, because in VFS2 fstat(2) will
-	// finally be translated into vfs.FilesystemImpl.StatAt() (see
-	// pkg/sentry/syscalls/linux/vfs2/stat.go), resulting in the same flow
-	// as stat(2). Thus GetAttrFlags and Fh variable will never be used in VFS2.
-	req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.Ino(), linux.FUSE_GETATTR, &in)
+	creds := auth.CredentialsFromContext(ctx)
+
+	in := linux.FUSEGetAttrIn{
+		GetAttrFlags: flags,
+		Fh:           fh,
+	}
+	req, err := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in)
 	if err != nil {
-		return linux.Statx{}, err
+		return linux.FUSEAttr{}, err
 	}
 
-	res, err := conn.Call(task, req)
+	res, err := i.fs.conn.Call(task, req)
 	if err != nil {
-		return linux.Statx{}, err
+		return linux.FUSEAttr{}, err
 	}
 	if err := res.Error(); err != nil {
-		return linux.Statx{}, err
+		return linux.FUSEAttr{}, err
 	}
 
 	var out linux.FUSEGetAttrOut
 	if err := res.UnmarshalPayload(&out); err != nil {
-		return linux.Statx{}, err
+		return linux.FUSEAttr{}, err
+	}
+
+	// Local version is newer, return the local one.
+	// Skip the update.
+	if attributeVersion != 0 && atomic.LoadUint64(&i.attributeVersion) > attributeVersion {
+		return i.getFUSEAttr(), nil
 	}
 
-	// Set all metadata into kernfs.InodeAttrs.
-	if err := i.SetStat(ctx, fs, creds, vfs.SetStatOptions{
-		Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, fusefs.devMinor),
+	// Set the metadata of kernfs.InodeAttrs.
+	if err := i.InodeAttrs.SetStat(ctx, fs, creds, vfs.SetStatOptions{
+		Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor),
 	}); err != nil {
+		return linux.FUSEAttr{}, err
+	}
+
+	// Set the size if no error (after SetStat() check).
+	atomic.StoreUint64(&i.size, out.Attr.Size)
+
+	return out.Attr, nil
+}
+
+// reviseAttr attempts to update the attributes for internal purposes
+// by calling getAttr with a pre-specified mask.
+// Used by read, write, lseek.
+func (i *inode) reviseAttr(ctx context.Context, flags uint32, fh uint64) error {
+	// Never need atime for internal purposes.
+	_, err := i.getAttr(ctx, i.fs.VFSFilesystem(), vfs.StatOptions{
+		Mask: linux.STATX_BASIC_STATS &^ linux.STATX_ATIME,
+	}, flags, fh)
+	return err
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	attr, err := i.getAttr(ctx, fs, opts, 0, 0)
+	if err != nil {
 		return linux.Statx{}, err
 	}
 
-	return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil
+	return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil
+}
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *inode) DecRef(ctx context.Context) {
+	i.inodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+	// TODO(gvisor.dev/issues/3413): Complete the implementation of statfs.
+	return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil
+}
+
+// fattrMaskFromStats converts vfs.SetStatOptions.Stat.Mask to linux stats mask
+// aligned with the attribute mask defined in include/linux/fs.h.
+func fattrMaskFromStats(mask uint32) uint32 {
+	var fuseAttrMask uint32
+	maskMap := map[uint32]uint32{
+		linux.STATX_MODE:  linux.FATTR_MODE,
+		linux.STATX_UID:   linux.FATTR_UID,
+		linux.STATX_GID:   linux.FATTR_GID,
+		linux.STATX_SIZE:  linux.FATTR_SIZE,
+		linux.STATX_ATIME: linux.FATTR_ATIME,
+		linux.STATX_MTIME: linux.FATTR_MTIME,
+		linux.STATX_CTIME: linux.FATTR_CTIME,
+	}
+	for statxMask, fattrMask := range maskMap {
+		if mask&statxMask != 0 {
+			fuseAttrMask |= fattrMask
+		}
+	}
+	return fuseAttrMask
+}
+
+// SetStat implements kernfs.Inode.SetStat.
+func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	return i.setAttr(ctx, fs, creds, opts, false, 0)
+}
+
+func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions, useFh bool, fh uint64) error {
+	conn := i.fs.conn
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		log.Warningf("couldn't get kernel task from context")
+		return syserror.EINVAL
+	}
+
+	// We should retain the original file type when assigning new mode.
+	fileType := uint16(i.Mode()) & linux.S_IFMT
+	fattrMask := fattrMaskFromStats(opts.Stat.Mask)
+	if useFh {
+		fattrMask |= linux.FATTR_FH
+	}
+	in := linux.FUSESetAttrIn{
+		Valid:     fattrMask,
+		Fh:        fh,
+		Size:      opts.Stat.Size,
+		Atime:     uint64(opts.Stat.Atime.Sec),
+		Mtime:     uint64(opts.Stat.Mtime.Sec),
+		Ctime:     uint64(opts.Stat.Ctime.Sec),
+		AtimeNsec: opts.Stat.Atime.Nsec,
+		MtimeNsec: opts.Stat.Mtime.Nsec,
+		CtimeNsec: opts.Stat.Ctime.Nsec,
+		Mode:      uint32(fileType | opts.Stat.Mode),
+		UID:       opts.Stat.UID,
+		GID:       opts.Stat.GID,
+	}
+	req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in)
+	if err != nil {
+		return err
+	}
+
+	res, err := conn.Call(task, req)
+	if err != nil {
+		return err
+	}
+	if err := res.Error(); err != nil {
+		return err
+	}
+	out := linux.FUSEGetAttrOut{}
+	if err := res.UnmarshalPayload(&out); err != nil {
+		return err
+	}
+
+	// Set the metadata of kernfs.InodeAttrs.
+	if err := i.InodeAttrs.SetStat(ctx, fs, creds, vfs.SetStatOptions{
+		Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor),
+	}); err != nil {
+		return err
+	}
+
+	return nil
 }
diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go
new file mode 100644
index 000000000..2d396e84c
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/read_write.go
@@ -0,0 +1,244 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"io"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// ReadInPages sends FUSE_READ requests for the size after round it up to
+// a multiple of page size, blocks on it for reply, processes the reply
+// and returns the payload (or joined payloads) as a byte slice.
+// This is used for the general purpose reading.
+// We do not support direct IO (which read the exact number of bytes)
+// at this moment.
+func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off uint64, size uint32) ([][]byte, uint32, error) {
+	attributeVersion := atomic.LoadUint64(&fs.conn.attributeVersion)
+
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		log.Warningf("fusefs.Read: couldn't get kernel task from context")
+		return nil, 0, syserror.EINVAL
+	}
+
+	// Round up to a multiple of page size.
+	readSize, _ := usermem.PageRoundUp(uint64(size))
+
+	// One request cannnot exceed either maxRead or maxPages.
+	maxPages := fs.conn.maxRead >> usermem.PageShift
+	if maxPages > uint32(fs.conn.maxPages) {
+		maxPages = uint32(fs.conn.maxPages)
+	}
+
+	var outs [][]byte
+	var sizeRead uint32
+
+	// readSize is a multiple of usermem.PageSize.
+	// Always request bytes as a multiple of pages.
+	pagesRead, pagesToRead := uint32(0), uint32(readSize>>usermem.PageShift)
+
+	// Reuse the same struct for unmarshalling to avoid unnecessary memory allocation.
+	in := linux.FUSEReadIn{
+		Fh:        fd.Fh,
+		LockOwner: 0, // TODO(gvisor.dev/issue/3245): file lock
+		ReadFlags: 0, // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
+		Flags:     fd.statusFlags(),
+	}
+
+	// This loop is intended for fragmented read where the bytes to read is
+	// larger than either the maxPages or maxRead.
+	// For the majority of reads with normal size, this loop should only
+	// execute once.
+	for pagesRead < pagesToRead {
+		pagesCanRead := pagesToRead - pagesRead
+		if pagesCanRead > maxPages {
+			pagesCanRead = maxPages
+		}
+
+		in.Offset = off + (uint64(pagesRead) << usermem.PageShift)
+		in.Size = pagesCanRead << usermem.PageShift
+
+		req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_READ, &in)
+		if err != nil {
+			return nil, 0, err
+		}
+
+		// TODO(gvisor.dev/issue/3247): support async read.
+
+		res, err := fs.conn.Call(t, req)
+		if err != nil {
+			return nil, 0, err
+		}
+		if err := res.Error(); err != nil {
+			return nil, 0, err
+		}
+
+		// Not enough bytes in response,
+		// either we reached EOF,
+		// or the FUSE server sends back a response
+		// that cannot even fit the hdr.
+		if len(res.data) <= res.hdr.SizeBytes() {
+			// We treat both case as EOF here for now
+			// since there is no reliable way to detect
+			// the over-short hdr case.
+			break
+		}
+
+		// Directly using the slice to avoid extra copy.
+		out := res.data[res.hdr.SizeBytes():]
+
+		outs = append(outs, out)
+		sizeRead += uint32(len(out))
+
+		pagesRead += pagesCanRead
+	}
+
+	defer fs.ReadCallback(ctx, fd, off, size, sizeRead, attributeVersion)
+
+	// No bytes returned: offset >= EOF.
+	if len(outs) == 0 {
+		return nil, 0, io.EOF
+	}
+
+	return outs, sizeRead, nil
+}
+
+// ReadCallback updates several information after receiving a read response.
+// Due to readahead, sizeRead can be larger than size.
+func (fs *filesystem) ReadCallback(ctx context.Context, fd *regularFileFD, off uint64, size uint32, sizeRead uint32, attributeVersion uint64) {
+	// TODO(gvisor.dev/issue/3247): support async read.
+	// If this is called by an async read, correctly process it.
+	// May need to update the signature.
+
+	i := fd.inode()
+	i.InodeAttrs.TouchAtime(ctx, fd.vfsfd.Mount())
+
+	// Reached EOF.
+	if sizeRead < size {
+		// TODO(gvisor.dev/issue/3630): If we have writeback cache, then we need to fill this hole.
+		// Might need to update the buf to be returned from the Read().
+
+		// Update existing size.
+		newSize := off + uint64(sizeRead)
+		fs.conn.mu.Lock()
+		if attributeVersion == i.attributeVersion && newSize < atomic.LoadUint64(&i.size) {
+			fs.conn.attributeVersion++
+			i.attributeVersion = i.fs.conn.attributeVersion
+			atomic.StoreUint64(&i.size, newSize)
+		}
+		fs.conn.mu.Unlock()
+	}
+}
+
+// Write sends FUSE_WRITE requests and return the bytes
+// written according to the response.
+//
+// Preconditions: len(data) == size.
+func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, size uint32, data []byte) (uint32, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		log.Warningf("fusefs.Read: couldn't get kernel task from context")
+		return 0, syserror.EINVAL
+	}
+
+	// One request cannnot exceed either maxWrite or maxPages.
+	maxWrite := uint32(fs.conn.maxPages) << usermem.PageShift
+	if maxWrite > fs.conn.maxWrite {
+		maxWrite = fs.conn.maxWrite
+	}
+
+	// Reuse the same struct for unmarshalling to avoid unnecessary memory allocation.
+	in := linux.FUSEWriteIn{
+		Fh: fd.Fh,
+		// TODO(gvisor.dev/issue/3245): file lock
+		LockOwner: 0,
+		// TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
+		// TODO(gvisor.dev/issue/3237): |= linux.FUSE_WRITE_CACHE (not added yet)
+		WriteFlags: 0,
+		Flags:      fd.statusFlags(),
+	}
+
+	inode := fd.inode()
+	var written uint32
+
+	// This loop is intended for fragmented write where the bytes to write is
+	// larger than either the maxWrite or maxPages or when bigWrites is false.
+	// Unless a small value for max_write is explicitly used, this loop
+	// is expected to execute only once for the majority of the writes.
+	for written < size {
+		toWrite := size - written
+
+		// Limit the write size to one page.
+		// Note that the bigWrites flag is obsolete,
+		// latest libfuse always sets it on.
+		if !fs.conn.bigWrites && toWrite > usermem.PageSize {
+			toWrite = usermem.PageSize
+		}
+
+		// Limit the write size to maxWrite.
+		if toWrite > maxWrite {
+			toWrite = maxWrite
+		}
+
+		in.Offset = off + uint64(written)
+		in.Size = toWrite
+
+		req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), inode.nodeID, linux.FUSE_WRITE, &in)
+		if err != nil {
+			return 0, err
+		}
+
+		req.payload = data[written : written+toWrite]
+
+		// TODO(gvisor.dev/issue/3247): support async write.
+
+		res, err := fs.conn.Call(t, req)
+		if err != nil {
+			return 0, err
+		}
+		if err := res.Error(); err != nil {
+			return 0, err
+		}
+
+		out := linux.FUSEWriteOut{}
+		if err := res.UnmarshalPayload(&out); err != nil {
+			return 0, err
+		}
+
+		// Write more than requested? EIO.
+		if out.Size > toWrite {
+			return 0, syserror.EIO
+		}
+
+		written += out.Size
+
+		// Break if short write. Not necessarily an error.
+		if out.Size != toWrite {
+			break
+		}
+	}
+	inode.InodeAttrs.TouchCMtime(ctx)
+
+	return written, nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go
new file mode 100644
index 000000000..5bdd096c3
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/regular_file.go
@@ -0,0 +1,230 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"io"
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+type regularFileFD struct {
+	fileDescription
+
+	// off is the file offset.
+	off int64
+	// offMu protects off.
+	offMu sync.Mutex
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	size := dst.NumBytes()
+	if size == 0 {
+		// Early return if count is 0.
+		return 0, nil
+	} else if size > math.MaxUint32 {
+		// FUSE only supports uint32 for size.
+		// Overflow.
+		return 0, syserror.EINVAL
+	}
+
+	// TODO(gvisor.dev/issue/3678): Add direct IO support.
+
+	inode := fd.inode()
+
+	// Reading beyond EOF, update file size if outdated.
+	if uint64(offset+size) > atomic.LoadUint64(&inode.size) {
+		if err := inode.reviseAttr(ctx, linux.FUSE_GETATTR_FH, fd.Fh); err != nil {
+			return 0, err
+		}
+		// If the offset after update is still too large, return error.
+		if uint64(offset) >= atomic.LoadUint64(&inode.size) {
+			return 0, io.EOF
+		}
+	}
+
+	// Truncate the read with updated file size.
+	fileSize := atomic.LoadUint64(&inode.size)
+	if uint64(offset+size) > fileSize {
+		size = int64(fileSize) - offset
+	}
+
+	buffers, n, err := inode.fs.ReadInPages(ctx, fd, uint64(offset), uint32(size))
+	if err != nil {
+		return 0, err
+	}
+
+	// TODO(gvisor.dev/issue/3237): support indirect IO (e.g. caching),
+	// store the bytes that were read ahead.
+
+	// Update the number of bytes to copy for short read.
+	if n < uint32(size) {
+		size = int64(n)
+	}
+
+	// Copy the bytes read to the dst.
+	// This loop is intended for fragmented reads.
+	// For the majority of reads, this loop only execute once.
+	var copied int64
+	for _, buffer := range buffers {
+		toCopy := int64(len(buffer))
+		if copied+toCopy > size {
+			toCopy = size - copied
+		}
+		cp, err := dst.DropFirst64(copied).CopyOut(ctx, buffer[:toCopy])
+		if err != nil {
+			return 0, err
+		}
+		if int64(cp) != toCopy {
+			return 0, syserror.EIO
+		}
+		copied += toCopy
+	}
+
+	return copied, nil
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, _, err := fd.pwrite(ctx, src, offset, opts)
+	return n, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock()
+	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+	fd.off = off
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// pwrite returns the number of bytes written, final offset and error. The
+// final offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
+	if offset < 0 {
+		return 0, offset, syserror.EINVAL
+	}
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, offset, syserror.EOPNOTSUPP
+	}
+
+	inode := fd.inode()
+	inode.metadataMu.Lock()
+	defer inode.metadataMu.Unlock()
+
+	// If the file is opened with O_APPEND, update offset to file size.
+	// Note: since our Open() implements the interface of kernfs,
+	// and kernfs currently does not support O_APPEND, this will never
+	// be true before we switch out from kernfs.
+	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+		// Locking inode.metadataMu is sufficient for reading size
+		offset = int64(inode.size)
+	}
+
+	srclen := src.NumBytes()
+
+	if srclen > math.MaxUint32 {
+		// FUSE only supports uint32 for size.
+		// Overflow.
+		return 0, offset, syserror.EINVAL
+	}
+	if end := offset + srclen; end < offset {
+		// Overflow.
+		return 0, offset, syserror.EINVAL
+	}
+
+	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
+	if err != nil {
+		return 0, offset, err
+	}
+
+	if srclen == 0 {
+		// Return before causing any side effects.
+		return 0, offset, nil
+	}
+
+	src = src.TakeFirst64(srclen)
+
+	// TODO(gvisor.dev/issue/3237): Add cache support:
+	// buffer cache. Ideally we write from src to our buffer cache first.
+	// The slice passed to fs.Write() should be a slice from buffer cache.
+	data := make([]byte, srclen)
+	// Reason for making a copy here: connection.Call() blocks on kerneltask,
+	// which in turn acquires mm.activeMu lock. Functions like CopyInTo() will
+	// attemp to acquire the mm.activeMu lock as well -> deadlock.
+	// We must finish reading from the userspace memory before
+	// t.Block() deactivates it.
+	cp, err := src.CopyIn(ctx, data)
+	if err != nil {
+		return 0, offset, err
+	}
+	if int64(cp) != srclen {
+		return 0, offset, syserror.EIO
+	}
+
+	n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data)
+	if err != nil {
+		return 0, offset, err
+	}
+
+	if n == 0 {
+		// We have checked srclen != 0 previously.
+		// If err == nil, then it's a short write and we return EIO.
+		return 0, offset, syserror.EIO
+	}
+
+	written = int64(n)
+	finalOff = offset + written
+
+	if finalOff > int64(inode.size) {
+		atomic.StoreUint64(&inode.size, uint64(finalOff))
+		atomic.AddUint64(&inode.fs.conn.attributeVersion, 1)
+	}
+
+	return
+}
diff --git a/pkg/sentry/fsimpl/fuse/request_response.go b/pkg/sentry/fsimpl/fuse/request_response.go
new file mode 100644
index 000000000..7fa00569b
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/request_response.go
@@ -0,0 +1,229 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fuseInitRes is a variable-length wrapper of linux.FUSEInitOut. The FUSE
+// server may implement an older version of FUSE protocol, which contains a
+// linux.FUSEInitOut with less attributes.
+//
+// Dynamically-sized objects cannot be marshalled.
+type fuseInitRes struct {
+	marshal.StubMarshallable
+
+	// initOut contains the response from the FUSE server.
+	initOut linux.FUSEInitOut
+
+	// initLen is the total length of bytes of the response.
+	initLen uint32
+}
+
+// UnmarshalBytes deserializes src to the initOut attribute in a fuseInitRes.
+func (r *fuseInitRes) UnmarshalBytes(src []byte) {
+	out := &r.initOut
+
+	// Introduced before FUSE kernel version 7.13.
+	out.Major = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+	out.Minor = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+	out.MaxReadahead = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+	out.Flags = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+	out.MaxBackground = uint16(usermem.ByteOrder.Uint16(src[:2]))
+	src = src[2:]
+	out.CongestionThreshold = uint16(usermem.ByteOrder.Uint16(src[:2]))
+	src = src[2:]
+	out.MaxWrite = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+
+	// Introduced in FUSE kernel version 7.23.
+	if len(src) >= 4 {
+		out.TimeGran = uint32(usermem.ByteOrder.Uint32(src[:4]))
+		src = src[4:]
+	}
+	// Introduced in FUSE kernel version 7.28.
+	if len(src) >= 2 {
+		out.MaxPages = uint16(usermem.ByteOrder.Uint16(src[:2]))
+		src = src[2:]
+	}
+}
+
+// SizeBytes is the size of the payload of the FUSE_INIT response.
+func (r *fuseInitRes) SizeBytes() int {
+	return int(r.initLen)
+}
+
+// Ordinary requests have even IDs, while interrupts IDs are odd.
+// Used to increment the unique ID for each FUSE request.
+var reqIDStep uint64 = 2
+
+// Request represents a FUSE operation request that hasn't been sent to the
+// server yet.
+//
+// +stateify savable
+type Request struct {
+	requestEntry
+
+	id   linux.FUSEOpID
+	hdr  *linux.FUSEHeaderIn
+	data []byte
+
+	// payload for this request: extra bytes to write after
+	// the data slice. Used by FUSE_WRITE.
+	payload []byte
+
+	// If this request is async.
+	async bool
+	// If we don't care its response.
+	// Manually set by the caller.
+	noReply bool
+}
+
+// NewRequest creates a new request that can be sent to the FUSE server.
+func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
+	conn.fd.mu.Lock()
+	defer conn.fd.mu.Unlock()
+	conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
+
+	hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
+	hdr := linux.FUSEHeaderIn{
+		Len:    uint32(hdrLen + payload.SizeBytes()),
+		Opcode: opcode,
+		Unique: conn.fd.nextOpID,
+		NodeID: ino,
+		UID:    uint32(creds.EffectiveKUID),
+		GID:    uint32(creds.EffectiveKGID),
+		PID:    pid,
+	}
+
+	buf := make([]byte, hdr.Len)
+
+	// TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
+	hdr.MarshalBytes(buf[:hdrLen])
+	payload.MarshalBytes(buf[hdrLen:])
+
+	return &Request{
+		id:   hdr.Unique,
+		hdr:  &hdr,
+		data: buf,
+	}, nil
+}
+
+// futureResponse represents an in-flight request, that may or may not have
+// completed yet. Convert it to a resolved Response by calling Resolve, but note
+// that this may block.
+//
+// +stateify savable
+type futureResponse struct {
+	opcode linux.FUSEOpcode
+	ch     chan struct{}
+	hdr    *linux.FUSEHeaderOut
+	data   []byte
+
+	// If this request is async.
+	async bool
+}
+
+// newFutureResponse creates a future response to a FUSE request.
+func newFutureResponse(req *Request) *futureResponse {
+	return &futureResponse{
+		opcode: req.hdr.Opcode,
+		ch:     make(chan struct{}),
+		async:  req.async,
+	}
+}
+
+// resolve blocks the task until the server responds to its corresponding request,
+// then returns a resolved response.
+func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
+	// Return directly for async requests.
+	if f.async {
+		return nil, nil
+	}
+
+	if err := t.Block(f.ch); err != nil {
+		return nil, err
+	}
+
+	return f.getResponse(), nil
+}
+
+// getResponse creates a Response from the data the futureResponse has.
+func (f *futureResponse) getResponse() *Response {
+	return &Response{
+		opcode: f.opcode,
+		hdr:    *f.hdr,
+		data:   f.data,
+	}
+}
+
+// Response represents an actual response from the server, including the
+// response payload.
+//
+// +stateify savable
+type Response struct {
+	opcode linux.FUSEOpcode
+	hdr    linux.FUSEHeaderOut
+	data   []byte
+}
+
+// Error returns the error of the FUSE call.
+func (r *Response) Error() error {
+	errno := r.hdr.Error
+	if errno >= 0 {
+		return nil
+	}
+
+	sysErrNo := syscall.Errno(-errno)
+	return error(sysErrNo)
+}
+
+// DataLen returns the size of the response without the header.
+func (r *Response) DataLen() uint32 {
+	return r.hdr.Len - uint32(r.hdr.SizeBytes())
+}
+
+// UnmarshalPayload unmarshals the response data into m.
+func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
+	hdrLen := r.hdr.SizeBytes()
+	haveDataLen := r.hdr.Len - uint32(hdrLen)
+	wantDataLen := uint32(m.SizeBytes())
+
+	if haveDataLen < wantDataLen {
+		return fmt.Errorf("payload too small. Minimum data lenth required: %d,  but got data length %d", wantDataLen, haveDataLen)
+	}
+
+	// The response data is empty unless there is some payload. And so, doesn't
+	// need to be unmarshalled.
+	if r.data == nil {
+		return nil
+	}
+
+	// TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
+	m.UnmarshalBytes(r.data[hdrLen:])
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/utils_test.go b/pkg/sentry/fsimpl/fuse/utils_test.go
new file mode 100644
index 000000000..e1d9e3365
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/utils_test.go
@@ -0,0 +1,132 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"io"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func setup(t *testing.T) *testutil.System {
+	k, err := testutil.Boot()
+	if err != nil {
+		t.Fatalf("Error creating kernel: %v", err)
+	}
+
+	ctx := k.SupervisorContext()
+	creds := auth.CredentialsFromContext(ctx)
+
+	k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList:  true,
+		AllowUserMount: true,
+	})
+
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
+	if err != nil {
+		t.Fatalf("NewMountNamespace(): %v", err)
+	}
+
+	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
+}
+
+// newTestConnection creates a fuse connection that the sentry can communicate with
+// and the FD for the server to communicate with.
+func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
+	vfsObj := &vfs.VirtualFilesystem{}
+	fuseDev := &DeviceFD{}
+
+	if err := vfsObj.Init(system.Ctx); err != nil {
+		return nil, nil, err
+	}
+
+	vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+	defer vd.DecRef(system.Ctx)
+	if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, nil, err
+	}
+
+	fsopts := filesystemOptions{
+		maxActiveRequests: maxActiveRequests,
+	}
+	fs, err := newFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return fs.conn, &fuseDev.vfsfd, nil
+}
+
+type testPayload struct {
+	marshal.StubMarshallable
+	data uint32
+}
+
+// SizeBytes implements marshal.Marshallable.SizeBytes.
+func (t *testPayload) SizeBytes() int {
+	return 4
+}
+
+// MarshalBytes implements marshal.Marshallable.MarshalBytes.
+func (t *testPayload) MarshalBytes(dst []byte) {
+	usermem.ByteOrder.PutUint32(dst[:4], t.data)
+}
+
+// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
+func (t *testPayload) UnmarshalBytes(src []byte) {
+	*t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])}
+}
+
+// Packed implements marshal.Marshallable.Packed.
+func (t *testPayload) Packed() bool {
+	return true
+}
+
+// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
+func (t *testPayload) MarshalUnsafe(dst []byte) {
+	t.MarshalBytes(dst)
+}
+
+// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
+func (t *testPayload) UnmarshalUnsafe(src []byte) {
+	t.UnmarshalBytes(src)
+}
+
+// CopyOutN implements marshal.Marshallable.CopyOutN.
+func (t *testPayload) CopyOutN(task marshal.CopyContext, addr usermem.Addr, limit int) (int, error) {
+	panic("not implemented")
+}
+
+// CopyOut implements marshal.Marshallable.CopyOut.
+func (t *testPayload) CopyOut(task marshal.CopyContext, addr usermem.Addr) (int, error) {
+	panic("not implemented")
+}
+
+// CopyIn implements marshal.Marshallable.CopyIn.
+func (t *testPayload) CopyIn(task marshal.CopyContext, addr usermem.Addr) (int, error) {
+	panic("not implemented")
+}
+
+// WriteTo implements io.WriterTo.WriteTo.
+func (t *testPayload) WriteTo(w io.Writer) (int64, error) {
+	panic("not implemented")
+}
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 16787116f..4c3e9acf8 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -38,6 +38,7 @@ go_library(
         "host_named_pipe.go",
         "p9file.go",
         "regular_file.go",
+        "save_restore.go",
         "socket.go",
         "special_file.go",
         "symlink.go",
@@ -52,6 +53,8 @@ go_library(
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
+        "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/lock",
@@ -69,6 +72,7 @@ go_library(
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
+        "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/unet",
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 2a8011eb4..e993c8e36 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -16,16 +16,17 @@ package gofer
 
 import (
 	"fmt"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -34,8 +35,11 @@ func (d *dentry) isDir() bool {
 	return d.fileType() == linux.S_IFDIR
 }
 
-// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked.
-// d.isDir(). child must be a newly-created dentry that has never had a parent.
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+// * child must be a newly-created dentry that has never had a parent.
 func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
 	d.IncRef() // reference held by child on its parent
 	child.parent = d
@@ -46,7 +50,9 @@ func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
 	d.children[name] = child
 }
 
-// Preconditions: d.dirMu must be locked. d.isDir().
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
 func (d *dentry) cacheNegativeLookupLocked(name string) {
 	// Don't cache negative lookups if InteropModeShared is in effect (since
 	// this makes remote lookup unavoidable), or if d.isSynthetic() (in which
@@ -79,13 +85,15 @@ type createSyntheticOpts struct {
 // createSyntheticChildLocked creates a synthetic file with the given name
 // in d.
 //
-// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain
-// a child with the given name.
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
+// * d does not already contain a child with the given name.
 func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
-	d2 := &dentry{
+	child := &dentry{
 		refs:      1, // held by d
 		fs:        d.fs,
-		ino:       d.fs.nextSyntheticIno(),
+		ino:       d.fs.nextIno(),
 		mode:      uint32(opts.mode),
 		uid:       uint32(opts.kuid),
 		gid:       uint32(opts.kgid),
@@ -93,28 +101,32 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
 		hostFD:    -1,
 		nlink:     uint32(2),
 	}
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Register(child, "gofer.dentry")
+	}
 	switch opts.mode.FileType() {
 	case linux.S_IFDIR:
 		// Nothing else needs to be done.
 	case linux.S_IFSOCK:
-		d2.endpoint = opts.endpoint
+		child.endpoint = opts.endpoint
 	case linux.S_IFIFO:
-		d2.pipe = opts.pipe
+		child.pipe = opts.pipe
 	default:
 		panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType()))
 	}
-	d2.pf.dentry = d2
-	d2.vfsd.Init(d2)
+	child.pf.dentry = child
+	child.vfsd.Init(child)
 
-	d.cacheNewChildLocked(d2, opts.name)
+	d.cacheNewChildLocked(child, opts.name)
 	d.syntheticChildren++
 }
 
+// +stateify savable
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
 
-	mu      sync.Mutex
+	mu      sync.Mutex `state:"nosave"`
 	off     int64
 	dirents []vfs.Dirent
 }
@@ -151,7 +163,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	return nil
 }
 
-// Preconditions: d.isDir(). There exists at least one directoryFD representing d.
+// Preconditions:
+// * d.isDir().
+// * There exists at least one directoryFD representing d.
 func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 	// NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the
 	// presence of concurrent mutation of an iterated directory, so
@@ -225,7 +239,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 				}
 				dirent := vfs.Dirent{
 					Name:    p9d.Name,
-					Ino:     uint64(inoFromPath(p9d.QID.Path)),
+					Ino:     d.fs.inoFromQIDPath(p9d.QID.Path),
 					NextOff: int64(len(dirents) + 1),
 				}
 				// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 9a90351e5..baecb88c4 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -35,7 +35,7 @@ import (
 
 // Sync implements vfs.FilesystemImpl.Sync.
 func (fs *filesystem) Sync(ctx context.Context) error {
-	// Snapshot current syncable dentries and special files.
+	// Snapshot current syncable dentries and special file FDs.
 	fs.syncMu.Lock()
 	ds := make([]*dentry, 0, len(fs.syncableDentries))
 	for d := range fs.syncableDentries {
@@ -53,22 +53,28 @@ func (fs *filesystem) Sync(ctx context.Context) error {
 	// regardless.
 	var retErr error
 
-	// Sync regular files.
+	// Sync syncable dentries.
 	for _, d := range ds {
-		err := d.syncCachedFile(ctx)
+		err := d.syncCachedFile(ctx, true /* forFilesystemSync */)
 		d.DecRef(ctx)
-		if err != nil && retErr == nil {
-			retErr = err
+		if err != nil {
+			ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err)
+			if retErr == nil {
+				retErr = err
+			}
 		}
 	}
 
 	// Sync special files, which may be writable but do not use dentry shared
 	// handles (so they won't be synced by the above).
 	for _, sffd := range sffds {
-		err := sffd.Sync(ctx)
+		err := sffd.sync(ctx, true /* forFilesystemSync */)
 		sffd.vfsfd.DecRef(ctx)
-		if err != nil && retErr == nil {
-			retErr = err
+		if err != nil {
+			ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err)
+			if retErr == nil {
+				retErr = err
+			}
 		}
 	}
 
@@ -115,9 +121,12 @@ func putDentrySlice(ds *[]*dentry) {
 // Dentries which may become cached as a result of the traversal are appended
 // to *ds.
 //
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata
-// must be up to date.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !rp.Done().
+// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up
+//   to date.
 //
 // Postconditions: The returned dentry's cached metadata is up to date.
 func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
@@ -185,8 +194,11 @@ afterSymlink:
 // getChildLocked returns a dentry representing the child of parent with the
 // given name. If no such child exists, getChildLocked returns (nil, nil).
 //
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
-// parent.isDir(). name is not "." or "..".
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
+// * parent.isDir().
+// * name is not "." or "..".
 //
 // Postconditions: If getChildLocked returns a non-nil dentry, its cached
 // metadata is up to date.
@@ -206,7 +218,8 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil
 	return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds)
 }
 
-// Preconditions: As for getChildLocked. !parent.isSynthetic().
+// Preconditions: Same as getChildLocked, plus:
+// * !parent.isSynthetic().
 func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
 	if child != nil {
 		// Need to lock child.metadataMu because we might be updating child
@@ -222,7 +235,7 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 		return nil, err
 	}
 	if child != nil {
-		if !file.isNil() && inoFromPath(qid.Path) == child.ino {
+		if !file.isNil() && qid.Path == child.qidPath {
 			// The file at this path hasn't changed. Just update cached metadata.
 			file.close(ctx)
 			child.updateFromP9AttrsLocked(attrMask, &attr)
@@ -279,9 +292,11 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 // rp.Start().Impl().(*dentry)). It does not check that the returned directory
 // is searchable by the provider of rp.
 //
-// Preconditions: fs.renameMu must be locked. !rp.Done(). If
-// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to
-// date.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
+// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up
+//   to date.
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
 	for !rp.Final() {
 		d.dirMu.Lock()
@@ -328,9 +343,10 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 // createInRemoteDir (if the parent directory is a real remote directory) or
 // createInSyntheticDir (if the parent directory is synthetic) to do so.
 //
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error {
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) error, createInSyntheticDir func(parent *dentry, name string) error) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -399,7 +415,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 		// RPC will fail with EEXIST like we would have. If the RPC succeeds, and a
 		// stale dentry exists, the dentry will fail revalidation next time it's
 		// used.
-		if err := createInRemoteDir(parent, name); err != nil {
+		if err := createInRemoteDir(parent, name, &ds); err != nil {
 			return err
 		}
 		ev := linux.IN_CREATE
@@ -414,7 +430,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	}
 	// No cached dentry exists; however, there might still be an existing file
 	// at name. As above, we attempt the file creation RPC anyway.
-	if err := createInRemoteDir(parent, name); err != nil {
+	if err := createInRemoteDir(parent, name, &ds); err != nil {
 		return err
 	}
 	if child, ok := parent.children[name]; ok && child == nil {
@@ -721,7 +737,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
 
 // LinkAt implements vfs.FilesystemImpl.LinkAt.
 func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
-	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, _ **[]*dentry) error {
 		if rp.Mount() != vd.Mount() {
 			return syserror.EXDEV
 		}
@@ -754,7 +770,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
 	creds := rp.Credentials()
-	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, _ **[]*dentry) error {
 		if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
 			if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
 				return err
@@ -789,34 +805,49 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
-	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) error {
 		creds := rp.Credentials()
 		_, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
-		// If the gofer does not allow creating a socket or pipe, create a
-		// synthetic one, i.e. one that is kept entirely in memory.
-		if err == syserror.EPERM {
-			switch opts.Mode.FileType() {
-			case linux.S_IFSOCK:
-				parent.createSyntheticChildLocked(&createSyntheticOpts{
-					name:     name,
-					mode:     opts.Mode,
-					kuid:     creds.EffectiveKUID,
-					kgid:     creds.EffectiveKGID,
-					endpoint: opts.Endpoint,
-				})
-				return nil
-			case linux.S_IFIFO:
-				parent.createSyntheticChildLocked(&createSyntheticOpts{
-					name: name,
-					mode: opts.Mode,
-					kuid: creds.EffectiveKUID,
-					kgid: creds.EffectiveKGID,
-					pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
-				})
-				return nil
-			}
+		if err != syserror.EPERM {
+			return err
 		}
-		return err
+
+		// EPERM means that gofer does not allow creating a socket or pipe. Fallback
+		// to creating a synthetic one, i.e. one that is kept entirely in memory.
+
+		// Check that we're not overriding an existing file with a synthetic one.
+		_, err = fs.stepLocked(ctx, rp, parent, true, ds)
+		switch {
+		case err == nil:
+			// Step succeeded, another file exists.
+			return syserror.EEXIST
+		case err != syserror.ENOENT:
+			// Unexpected error.
+			return err
+		}
+
+		switch opts.Mode.FileType() {
+		case linux.S_IFSOCK:
+			parent.createSyntheticChildLocked(&createSyntheticOpts{
+				name:     name,
+				mode:     opts.Mode,
+				kuid:     creds.EffectiveKUID,
+				kgid:     creds.EffectiveKGID,
+				endpoint: opts.Endpoint,
+			})
+			return nil
+		case linux.S_IFIFO:
+			parent.createSyntheticChildLocked(&createSyntheticOpts{
+				name: name,
+				mode: opts.Mode,
+				kuid: creds.EffectiveKUID,
+				kgid: creds.EffectiveKGID,
+				pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+			})
+			return nil
+		}
+		// Retain error from gofer if synthetic file cannot be created internally.
+		return syserror.EPERM
 	}, nil)
 }
 
@@ -1001,7 +1032,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 		// step is required even if !d.cachedMetadataAuthoritative() because
 		// d.mappings has to be updated.
 		// d.metadataMu has already been acquired if trunc == true.
-		d.updateFileSizeLocked(0)
+		d.updateSizeLocked(0)
 
 		if d.cachedMetadataAuthoritative() {
 			d.touchCMtimeLocked()
@@ -1072,8 +1103,10 @@ retry:
 	return &fd.vfsfd, nil
 }
 
-// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
-// !d.isSynthetic().
+// Preconditions:
+// * d.fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !d.isSynthetic().
 func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
 	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 		return nil, err
@@ -1284,6 +1317,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 			if !renamed.isDir() {
 				return syserror.EISDIR
 			}
+			if genericIsAncestorDentry(replaced, renamed) {
+				return syserror.ENOTEMPTY
+			}
 		} else {
 			if rp.MustBeDir() || renamed.isDir() {
 				return syserror.ENOTDIR
@@ -1334,14 +1370,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	// with reference counts and queue oldParent for checkCachingLocked if the
 	// parent isn't actually changing.
 	if oldParent != newParent {
+		oldParent.decRefLocked()
 		ds = appendDentry(ds, oldParent)
 		newParent.IncRef()
 		if renamed.isSynthetic() {
 			oldParent.syntheticChildren--
 			newParent.syntheticChildren++
 		}
+		renamed.parent = newParent
 	}
-	renamed.parent = newParent
 	renamed.name = newName
 	if newParent.children == nil {
 		newParent.children = make(map[string]*dentry)
@@ -1385,11 +1422,11 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 		return err
 	}
-	if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil {
-		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	err = d.setStat(ctx, rp.Credentials(), &opts, rp.Mount())
+	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	if err != nil {
 		return err
 	}
-	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 
 	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
 		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
@@ -1452,7 +1489,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
-	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, _ **[]*dentry) error {
 		creds := rp.Credentials()
 		_, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
 		return err
@@ -1464,7 +1501,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return fs.unlinkAt(ctx, rp, false /* dir */)
 }
 
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
 func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
@@ -1481,17 +1518,18 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 			d.IncRef()
 			return &endpoint{
 				dentry: d,
-				file:   d.file.file,
 				path:   opts.Addr,
 			}, nil
 		}
-		return d.endpoint, nil
+		if d.endpoint != nil {
+			return d.endpoint, nil
+		}
 	}
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -1499,11 +1537,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
 	if err != nil {
 		return nil, err
 	}
-	return d.listxattr(ctx, rp.Credentials(), size)
+	return d.listXattr(ctx, rp.Credentials(), size)
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -1511,11 +1549,11 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	if err != nil {
 		return "", err
 	}
-	return d.getxattr(ctx, rp.Credentials(), &opts)
+	return d.getXattr(ctx, rp.Credentials(), &opts)
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	d, err := fs.resolveLocked(ctx, rp, &ds)
@@ -1523,18 +1561,18 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 		return err
 	}
-	if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil {
-		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	err = d.setXattr(ctx, rp.Credentials(), &opts)
+	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	if err != nil {
 		return err
 	}
-	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
 	return nil
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	d, err := fs.resolveLocked(ctx, rp, &ds)
@@ -1542,11 +1580,11 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 		return err
 	}
-	if err := d.removexattr(ctx, rp.Credentials(), name); err != nil {
-		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	err = d.removeXattr(ctx, rp.Credentials(), name)
+	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	if err != nil {
 		return err
 	}
-	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
 	return nil
@@ -1558,7 +1596,3 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 	defer fs.renameMu.RUnlock()
 	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
 }
-
-func (fs *filesystem) nextSyntheticIno() inodeNumber {
-	return inodeNumber(atomic.AddUint64(&fs.syntheticSeq, 1) | syntheticInoMask)
-}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 63e589859..80668ebc1 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -26,6 +26,9 @@
 //             *** "memmap.Mappable locks taken by Translate" below this point
 //             dentry.handleMu
 //               dentry.dataMu
+//           filesystem.inoMu
+//   specialFileFD.mu
+//     specialFileFD.bufMu
 //
 // Locking dentry.dirMu in multiple dentries requires that either ancestor
 // dentries are locked before descendant dentries, or that filesystem.renameMu
@@ -36,7 +39,6 @@ import (
 	"fmt"
 	"strconv"
 	"strings"
-	"sync"
 	"sync/atomic"
 	"syscall"
 
@@ -44,6 +46,8 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
+	refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -53,6 +57,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -62,9 +67,13 @@ import (
 const Name = "9p"
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	vfsfs vfs.Filesystem
 
@@ -77,7 +86,7 @@ type filesystem struct {
 	iopts InternalFilesystemOptions
 
 	// client is the client used by this filesystem. client is immutable.
-	client *p9.Client
+	client *p9.Client `state:"nosave"`
 
 	// clock is a realtime clock used to set timestamps in file operations.
 	clock ktime.Clock
@@ -85,6 +94,9 @@ type filesystem struct {
 	// devMinor is the filesystem's minor device number. devMinor is immutable.
 	devMinor uint32
 
+	// root is the root dentry. root is immutable.
+	root *dentry
+
 	// renameMu serves two purposes:
 	//
 	// - It synchronizes path resolution with renaming initiated by this
@@ -95,43 +107,42 @@ type filesystem struct {
 	// reference count (such that it is usable as vfs.ResolvingPath.Start() or
 	// is reachable from its children), or if it is a child dentry (such that
 	// it is reachable from its parent).
-	renameMu sync.RWMutex
+	renameMu sync.RWMutex `state:"nosave"`
 
 	// cachedDentries contains all dentries with 0 references. (Due to race
 	// conditions, it may also contain dentries with non-zero references.)
-	// cachedDentriesLen is the number of dentries in cachedDentries. These
-	// fields are protected by renameMu.
+	// cachedDentriesLen is the number of dentries in cachedDentries. These fields
+	// are protected by renameMu.
 	cachedDentries    dentryList
 	cachedDentriesLen uint64
 
-	// syncableDentries contains all dentries in this filesystem for which
-	// !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs.
-	// These fields are protected by syncMu.
-	syncMu           sync.Mutex
+	// syncableDentries contains all non-synthetic dentries. specialFileFDs
+	// contains all open specialFileFDs. These fields are protected by syncMu.
+	syncMu           sync.Mutex `state:"nosave"`
 	syncableDentries map[*dentry]struct{}
 	specialFileFDs   map[*specialFileFD]struct{}
 
-	// syntheticSeq stores a counter to used to generate unique inodeNumber for
-	// synthetic dentries.
-	syntheticSeq uint64
-}
+	// inoByQIDPath maps previously-observed QID.Paths to inode numbers
+	// assigned to those paths. inoByQIDPath is not preserved across
+	// checkpoint/restore because QIDs may be reused between different gofer
+	// processes, so QIDs may be repeated for different files across
+	// checkpoint/restore. inoByQIDPath is protected by inoMu.
+	inoMu        sync.Mutex        `state:"nosave"`
+	inoByQIDPath map[uint64]uint64 `state:"nosave"`
 
-// inodeNumber represents inode number reported in Dirent.Ino. For regular
-// dentries, it comes from QID.Path from the 9P server. Synthetic dentries
-// have have their inodeNumber generated sequentially, with the MSB reserved to
-// prevent conflicts with regular dentries.
-type inodeNumber uint64
+	// lastIno is the last inode number assigned to a file. lastIno is accessed
+	// using atomic memory operations.
+	lastIno uint64
 
-// Reserve MSB for synthetic mounts.
-const syntheticInoMask = uint64(1) << 63
+	// savedDentryRW records open read/write handles during save/restore.
+	savedDentryRW map[*dentry]savedDentryRW
 
-func inoFromPath(path uint64) inodeNumber {
-	if path&syntheticInoMask != 0 {
-		log.Warningf("Dropping MSB from ino, collision is possible. Original: %d, new: %d", path, path&^syntheticInoMask)
-	}
-	return inodeNumber(path &^ syntheticInoMask)
+	// released is nonzero once filesystem.Release has been called. It is accessed
+	// with atomic memory operations.
+	released int32
 }
 
+// +stateify savable
 type filesystemOptions struct {
 	// "Standard" 9P options.
 	fd      int
@@ -142,8 +153,7 @@ type filesystemOptions struct {
 	msize   uint32
 	version string
 
-	// maxCachedDentries is the maximum number of dentries with 0 references
-	// retained by the client.
+	// maxCachedDentries is the maximum size of filesystem.cachedDentries.
 	maxCachedDentries uint64
 
 	// If forcePageCache is true, host FDs may not be used for application
@@ -177,6 +187,8 @@ type filesystemOptions struct {
 
 // InteropMode controls the client's interaction with other remote filesystem
 // users.
+//
+// +stateify savable
 type InteropMode uint32
 
 const (
@@ -195,11 +207,7 @@ const (
 	// and consistent with Linux's semantics (in particular, it is not always
 	// possible for clients to set arbitrary atimes and mtimes depending on the
 	// remote filesystem implementation, and never possible for clients to set
-	// arbitrary ctimes.) If a dentry containing a client-defined atime or
-	// mtime is evicted from cache, client timestamps will be sent to the
-	// remote filesystem on a best-effort basis to attempt to ensure that
-	// timestamps will be preserved when another dentry representing the same
-	// file is instantiated.
+	// arbitrary ctimes.)
 	InteropModeExclusive InteropMode = iota
 
 	// InteropModeWritethrough is appropriate when there are read-only users of
@@ -239,7 +247,13 @@ const (
 
 // InternalFilesystemOptions may be passed as
 // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+//
+// +stateify savable
 type InternalFilesystemOptions struct {
+	// If UniqueID is non-empty, it is an opaque string used to reassociate the
+	// filesystem with a new server FD during restoration from checkpoint.
+	UniqueID string
+
 	// If LeakConnection is true, do not close the connection to the server
 	// when the Filesystem is released. This is necessary for deployments in
 	// which servers can handle only a single client and report failure if that
@@ -265,6 +279,9 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
@@ -276,46 +293,11 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	mopts := vfs.GenericParseMountOptions(opts.Data)
 	var fsopts filesystemOptions
 
-	// Check that the transport is "fd".
-	trans, ok := mopts["trans"]
-	if !ok {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'")
-		return nil, nil, syserror.EINVAL
-	}
-	delete(mopts, "trans")
-	if trans != "fd" {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans)
-		return nil, nil, syserror.EINVAL
-	}
-
-	// Check that read and write FDs are provided and identical.
-	rfdstr, ok := mopts["rfdno"]
-	if !ok {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=<file descriptor>")
-		return nil, nil, syserror.EINVAL
-	}
-	delete(mopts, "rfdno")
-	rfd, err := strconv.Atoi(rfdstr)
+	fd, err := getFDFromMountOptionsMap(ctx, mopts)
 	if err != nil {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr)
-		return nil, nil, syserror.EINVAL
-	}
-	wfdstr, ok := mopts["wfdno"]
-	if !ok {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=<file descriptor>")
-		return nil, nil, syserror.EINVAL
-	}
-	delete(mopts, "wfdno")
-	wfd, err := strconv.Atoi(wfdstr)
-	if err != nil {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr)
-		return nil, nil, syserror.EINVAL
-	}
-	if rfd != wfd {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
-		return nil, nil, syserror.EINVAL
+		return nil, nil, err
 	}
-	fsopts.fd = rfd
+	fsopts.fd = fd
 
 	// Get the attach name.
 	fsopts.aname = "/"
@@ -431,56 +413,43 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	}
 	// If !ok, iopts being the zero value is correct.
 
-	// Establish a connection with the server.
-	conn, err := unet.NewSocket(fsopts.fd)
+	// Construct the filesystem object.
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
 	if err != nil {
 		return nil, nil, err
 	}
+	fs := &filesystem{
+		mfp:              mfp,
+		opts:             fsopts,
+		iopts:            iopts,
+		clock:            ktime.RealtimeClockFromContext(ctx),
+		devMinor:         devMinor,
+		syncableDentries: make(map[*dentry]struct{}),
+		specialFileFDs:   make(map[*specialFileFD]struct{}),
+		inoByQIDPath:     make(map[uint64]uint64),
+	}
+	fs.vfsfs.Init(vfsObj, &fstype, fs)
 
-	// Perform version negotiation with the server.
-	ctx.UninterruptibleSleepStart(false)
-	client, err := p9.NewClient(conn, fsopts.msize, fsopts.version)
-	ctx.UninterruptibleSleepFinish(false)
-	if err != nil {
-		conn.Close()
+	// Connect to the server.
+	if err := fs.dial(ctx); err != nil {
 		return nil, nil, err
 	}
-	// Ownership of conn has been transferred to client.
 
 	// Perform attach to obtain the filesystem root.
 	ctx.UninterruptibleSleepStart(false)
-	attached, err := client.Attach(fsopts.aname)
+	attached, err := fs.client.Attach(fsopts.aname)
 	ctx.UninterruptibleSleepFinish(false)
 	if err != nil {
-		client.Close()
+		fs.vfsfs.DecRef(ctx)
 		return nil, nil, err
 	}
 	attachFile := p9file{attached}
 	qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
 	if err != nil {
 		attachFile.close(ctx)
-		client.Close()
-		return nil, nil, err
-	}
-
-	// Construct the filesystem object.
-	devMinor, err := vfsObj.GetAnonBlockDevMinor()
-	if err != nil {
-		attachFile.close(ctx)
-		client.Close()
+		fs.vfsfs.DecRef(ctx)
 		return nil, nil, err
 	}
-	fs := &filesystem{
-		mfp:              mfp,
-		opts:             fsopts,
-		iopts:            iopts,
-		client:           client,
-		clock:            ktime.RealtimeClockFromContext(ctx),
-		devMinor:         devMinor,
-		syncableDentries: make(map[*dentry]struct{}),
-		specialFileFDs:   make(map[*specialFileFD]struct{}),
-	}
-	fs.vfsfs.Init(vfsObj, &fstype, fs)
 
 	// Construct the root dentry.
 	root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr)
@@ -490,25 +459,87 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		return nil, nil, err
 	}
 	// Set the root's reference count to 2. One reference is returned to the
-	// caller, and the other is deliberately leaked to prevent the root from
-	// being "cached" and subsequently evicted. Its resources will still be
-	// cleaned up by fs.Release().
+	// caller, and the other is held by fs to prevent the root from being "cached"
+	// and subsequently evicted.
 	root.refs = 2
+	fs.root = root
 
 	return &fs.vfsfs, &root.vfsd, nil
 }
 
+func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) {
+	// Check that the transport is "fd".
+	trans, ok := mopts["trans"]
+	if !ok || trans != "fd" {
+		ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as 'trans=fd'")
+		return -1, syserror.EINVAL
+	}
+	delete(mopts, "trans")
+
+	// Check that read and write FDs are provided and identical.
+	rfdstr, ok := mopts["rfdno"]
+	if !ok {
+		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as 'rfdno=<file descriptor>'")
+		return -1, syserror.EINVAL
+	}
+	delete(mopts, "rfdno")
+	rfd, err := strconv.Atoi(rfdstr)
+	if err != nil {
+		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: rfdno=%s", rfdstr)
+		return -1, syserror.EINVAL
+	}
+	wfdstr, ok := mopts["wfdno"]
+	if !ok {
+		ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as 'wfdno=<file descriptor>'")
+		return -1, syserror.EINVAL
+	}
+	delete(mopts, "wfdno")
+	wfd, err := strconv.Atoi(wfdstr)
+	if err != nil {
+		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: wfdno=%s", wfdstr)
+		return -1, syserror.EINVAL
+	}
+	if rfd != wfd {
+		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
+		return -1, syserror.EINVAL
+	}
+	return rfd, nil
+}
+
+// Preconditions: fs.client == nil.
+func (fs *filesystem) dial(ctx context.Context) error {
+	// Establish a connection with the server.
+	conn, err := unet.NewSocket(fs.opts.fd)
+	if err != nil {
+		return err
+	}
+
+	// Perform version negotiation with the server.
+	ctx.UninterruptibleSleepStart(false)
+	client, err := p9.NewClient(conn, fs.opts.msize, fs.opts.version)
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		conn.Close()
+		return err
+	}
+	// Ownership of conn has been transferred to client.
+
+	fs.client = client
+	return nil
+}
+
 // Release implements vfs.FilesystemImpl.Release.
 func (fs *filesystem) Release(ctx context.Context) {
-	mf := fs.mfp.MemoryFile()
+	atomic.StoreInt32(&fs.released, 1)
 
+	mf := fs.mfp.MemoryFile()
 	fs.syncMu.Lock()
 	for d := range fs.syncableDentries {
 		d.handleMu.Lock()
 		d.dataMu.Lock()
 		if h := d.writeHandleLocked(); h.isOpen() {
 			// Write dirty cached data to the remote file.
-			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), h.writeFromBlocksAt); err != nil {
+			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
 				log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
 			}
 			// TODO(jamieliu): Do we need to flushf/fsync d?
@@ -529,6 +560,21 @@ func (fs *filesystem) Release(ctx context.Context) {
 	// fs.
 	fs.syncMu.Unlock()
 
+	// If leak checking is enabled, release all outstanding references in the
+	// filesystem. We deliberately avoid doing this outside of leak checking; we
+	// have released all external resources above rather than relying on dentry
+	// destructors.
+	if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking {
+		fs.renameMu.Lock()
+		fs.root.releaseSyntheticRecursiveLocked(ctx)
+		fs.evictAllCachedDentriesLocked(ctx)
+		fs.renameMu.Unlock()
+
+		// An extra reference was held by the filesystem on the root to prevent it from
+		// being cached/evicted.
+		fs.root.DecRef(ctx)
+	}
+
 	if !fs.iopts.LeakConnection {
 		// Close the connection to the server. This implicitly clunks all fids.
 		fs.client.Close()
@@ -537,7 +583,34 @@ func (fs *filesystem) Release(ctx context.Context) {
 	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
 }
 
+// releaseSyntheticRecursiveLocked traverses the tree with root d and decrements
+// the reference count on every synthetic dentry. Synthetic dentries have one
+// reference for existence that should be dropped during filesystem.Release.
+//
+// Precondition: d.fs.renameMu is locked.
+func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) {
+	if d.isSynthetic() {
+		d.decRefLocked()
+		d.checkCachingLocked(ctx)
+	}
+	if d.isDir() {
+		var children []*dentry
+		d.dirMu.Lock()
+		for _, child := range d.children {
+			children = append(children, child)
+		}
+		d.dirMu.Unlock()
+		for _, child := range children {
+			if child != nil {
+				child.releaseSyntheticRecursiveLocked(ctx)
+			}
+		}
+	}
+}
+
 // dentry implements vfs.DentryImpl.
+//
+// +stateify savable
 type dentry struct {
 	vfsd vfs.Dentry
 
@@ -562,12 +635,15 @@ type dentry struct {
 	// filesystem.renameMu.
 	name string
 
+	// qidPath is the p9.QID.Path for this file. qidPath is immutable.
+	qidPath uint64
+
 	// file is the unopened p9.File that backs this dentry. file is immutable.
 	//
 	// If file.isNil(), this dentry represents a synthetic file, i.e. a file
 	// that does not exist on the remote filesystem. As of this writing, the
 	// only files that can be synthetic are sockets, pipes, and directories.
-	file p9file
+	file p9file `state:"nosave"`
 
 	// If deleted is non-zero, the file represented by this dentry has been
 	// deleted. deleted is accessed using atomic memory operations.
@@ -579,7 +655,7 @@ type dentry struct {
 	cached bool
 	dentryEntry
 
-	dirMu sync.Mutex
+	dirMu sync.Mutex `state:"nosave"`
 
 	// If this dentry represents a directory, children contains:
 	//
@@ -611,12 +687,12 @@ type dentry struct {
 	// To mutate:
 	//   - Lock metadataMu and use atomic operations to update because we might
 	//     have atomic readers that don't hold the lock.
-	metadataMu sync.Mutex
-	ino        inodeNumber // immutable
-	mode       uint32      // type is immutable, perms are mutable
-	uid        uint32      // auth.KUID, but stored as raw uint32 for sync/atomic
-	gid        uint32      // auth.KGID, but ...
-	blockSize  uint32      // 0 if unknown
+	metadataMu sync.Mutex `state:"nosave"`
+	ino        uint64     // immutable
+	mode       uint32     // type is immutable, perms are mutable
+	uid        uint32     // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid        uint32     // auth.KGID, but ...
+	blockSize  uint32     // 0 if unknown
 	// Timestamps, all nsecs from the Unix epoch.
 	atime int64
 	mtime int64
@@ -642,7 +718,7 @@ type dentry struct {
 	// other metadata fields.
 	nlink uint32
 
-	mapsMu sync.Mutex
+	mapsMu sync.Mutex `state:"nosave"`
 
 	// If this dentry represents a regular file, mappings tracks mappings of
 	// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
@@ -666,12 +742,12 @@ type dentry struct {
 	// either p9.File transitions from closed (isNil() == true) to open
 	// (isNil() == false), it may be mutated with handleMu locked, but cannot
 	// be closed until the dentry is destroyed.
-	handleMu  sync.RWMutex
-	readFile  p9file
-	writeFile p9file
-	hostFD    int32
+	handleMu  sync.RWMutex `state:"nosave"`
+	readFile  p9file       `state:"nosave"`
+	writeFile p9file       `state:"nosave"`
+	hostFD    int32        `state:"nosave"`
 
-	dataMu sync.RWMutex
+	dataMu sync.RWMutex `state:"nosave"`
 
 	// If this dentry represents a regular file that is client-cached, cache
 	// maps offsets into the cached file to offsets into
@@ -703,6 +779,13 @@ type dentry struct {
 	locks vfs.FileLocks
 
 	// Inotify watches for this dentry.
+	//
+	// Note that inotify may behave unexpectedly in the presence of hard links,
+	// because dentries corresponding to the same file have separate inotify
+	// watches when they should share the same set. This is the case because it is
+	// impossible for us to know for sure whether two dentries correspond to the
+	// same underlying file (see the gofer filesystem section fo vfs/inotify.md for
+	// a more in-depth discussion on this matter).
 	watches vfs.Watches
 }
 
@@ -739,8 +822,9 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 
 	d := &dentry{
 		fs:        fs,
+		qidPath:   qid.Path,
 		file:      file,
-		ino:       inoFromPath(qid.Path),
+		ino:       fs.inoFromQIDPath(qid.Path),
 		mode:      uint32(attr.Mode),
 		uid:       uint32(fs.opts.dfltuid),
 		gid:       uint32(fs.opts.dfltgid),
@@ -776,6 +860,9 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 		d.nlink = uint32(attr.NLink)
 	}
 	d.vfsd.Init(d)
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Register(d, "gofer.dentry")
+	}
 
 	fs.syncMu.Lock()
 	fs.syncableDentries[d] = struct{}{}
@@ -783,6 +870,21 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 	return d, nil
 }
 
+func (fs *filesystem) inoFromQIDPath(qidPath uint64) uint64 {
+	fs.inoMu.Lock()
+	defer fs.inoMu.Unlock()
+	if ino, ok := fs.inoByQIDPath[qidPath]; ok {
+		return ino
+	}
+	ino := fs.nextIno()
+	fs.inoByQIDPath[qidPath] = ino
+	return ino
+}
+
+func (fs *filesystem) nextIno() uint64 {
+	return atomic.AddUint64(&fs.lastIno, 1)
+}
+
 func (d *dentry) isSynthetic() bool {
 	return d.file.isNil()
 }
@@ -830,11 +932,11 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
 		atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
 	}
 	if mask.Size {
-		d.updateFileSizeLocked(attr.Size)
+		d.updateSizeLocked(attr.Size)
 	}
 }
 
-// Preconditions: !d.isSynthetic()
+// Preconditions: !d.isSynthetic().
 func (d *dentry) updateFromGetattr(ctx context.Context) error {
 	// Use d.readFile or d.writeFile, which represent 9P fids that have been
 	// opened, in preference to d.file, which represents a 9P fid that has not.
@@ -897,10 +999,10 @@ func (d *dentry) statTo(stat *linux.Statx) {
 	// This is consistent with regularFileFD.Seek(), which treats regular files
 	// as having no holes.
 	stat.Blocks = (stat.Size + 511) / 512
-	stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime))
-	stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime))
-	stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime))
-	stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime))
+	stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.atime))
+	stat.Btime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.btime))
+	stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.ctime))
+	stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.mtime))
 	stat.DevMajor = linux.UNNAMED_MAJOR
 	stat.DevMinor = d.fs.devMinor
 }
@@ -948,10 +1050,10 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
 		// Use client clocks for timestamps.
 		now = d.fs.clock.Now().Nanoseconds()
 		if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW {
-			stat.Atime = statxTimestampFromDentry(now)
+			stat.Atime = linux.NsecToStatxTimestamp(now)
 		}
 		if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW {
-			stat.Mtime = statxTimestampFromDentry(now)
+			stat.Mtime = linux.NsecToStatxTimestamp(now)
 		}
 	}
 
@@ -984,7 +1086,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
 				// d.size should be kept up to date, and privatized
 				// copy-on-write mappings of truncated pages need to be
 				// invalidated, even if InteropModeShared is in effect.
-				d.updateFileSizeLocked(stat.Size)
+				d.updateSizeLocked(stat.Size)
 			}
 		}
 		if d.fs.opts.interop == InteropModeShared {
@@ -1010,19 +1112,42 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
 	// !d.cachedMetadataAuthoritative() then we returned after calling
 	// d.file.setAttr(). For the same reason, now must have been initialized.
 	if stat.Mask&linux.STATX_ATIME != 0 {
-		atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
+		atomic.StoreInt64(&d.atime, stat.Atime.ToNsec())
 		atomic.StoreUint32(&d.atimeDirty, 0)
 	}
 	if stat.Mask&linux.STATX_MTIME != 0 {
-		atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
+		atomic.StoreInt64(&d.mtime, stat.Mtime.ToNsec())
 		atomic.StoreUint32(&d.mtimeDirty, 0)
 	}
 	atomic.StoreInt64(&d.ctime, now)
 	return nil
 }
 
+// doAllocate performs an allocate operation on d. Note that d.metadataMu will
+// be held when allocate is called.
+func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error {
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+
+	// Allocating a smaller size is a noop.
+	size := offset + length
+	if d.cachedMetadataAuthoritative() && size <= d.size {
+		return nil
+	}
+
+	err := allocate()
+	if err != nil {
+		return err
+	}
+	d.updateSizeLocked(size)
+	if d.cachedMetadataAuthoritative() {
+		d.touchCMtimeLocked()
+	}
+	return nil
+}
+
 // Preconditions: d.metadataMu must be locked.
-func (d *dentry) updateFileSizeLocked(newSize uint64) {
+func (d *dentry) updateSizeLocked(newSize uint64) {
 	d.dataMu.Lock()
 	oldSize := d.size
 	atomic.StoreUint64(&d.size, newSize)
@@ -1060,6 +1185,21 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
+func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+	// We only support xattrs prefixed with "user." (see b/148380782). Currently,
+	// there is no need to expose any other xattrs through a gofer.
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&d.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&d.gid))
+	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+		return err
+	}
+	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
+}
+
 func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
 	return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid)))
 }
@@ -1118,6 +1258,11 @@ func (d *dentry) decRefLocked() {
 	}
 }
 
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (d *dentry) LeakMessage() string {
+	return fmt.Sprintf("[gofer.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs))
+}
+
 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
 func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
 	if d.isDir() {
@@ -1166,6 +1311,10 @@ func (d *dentry) checkCachingLocked(ctx context.Context) {
 	// resolution, which requires renameMu, so if d.refs is zero then it will
 	// remain zero while we hold renameMu for writing.)
 	refs := atomic.LoadInt64(&d.refs)
+	if refs == -1 {
+		// Dentry has already been destroyed.
+		return
+	}
 	if refs > 0 {
 		if d.cached {
 			d.fs.cachedDentries.Remove(d)
@@ -1174,10 +1323,6 @@ func (d *dentry) checkCachingLocked(ctx context.Context) {
 		}
 		return
 	}
-	if refs == -1 {
-		// Dentry has already been destroyed.
-		return
-	}
 	// Deleted and invalidated dentries with zero references are no longer
 	// reachable by path resolution and should be dropped immediately.
 	if d.vfsd.IsDead() {
@@ -1200,6 +1345,16 @@ func (d *dentry) checkCachingLocked(ctx context.Context) {
 	if d.watches.Size() > 0 {
 		return
 	}
+
+	if atomic.LoadInt32(&d.fs.released) != 0 {
+		if d.parent != nil {
+			d.parent.dirMu.Lock()
+			delete(d.parent.children, d.name)
+			d.parent.dirMu.Unlock()
+		}
+		d.destroyLocked(ctx)
+	}
+
 	// If d is already cached, just move it to the front of the LRU.
 	if d.cached {
 		d.fs.cachedDentries.Remove(d)
@@ -1212,33 +1367,48 @@ func (d *dentry) checkCachingLocked(ctx context.Context) {
 	d.fs.cachedDentriesLen++
 	d.cached = true
 	if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries {
-		victim := d.fs.cachedDentries.Back()
-		d.fs.cachedDentries.Remove(victim)
-		d.fs.cachedDentriesLen--
-		victim.cached = false
-		// victim.refs may have become non-zero from an earlier path resolution
-		// since it was inserted into fs.cachedDentries.
-		if atomic.LoadInt64(&victim.refs) == 0 {
-			if victim.parent != nil {
-				victim.parent.dirMu.Lock()
-				if !victim.vfsd.IsDead() {
-					// Note that victim can't be a mount point (in any mount
-					// namespace), since VFS holds references on mount points.
-					d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd)
-					delete(victim.parent.children, victim.name)
-					// We're only deleting the dentry, not the file it
-					// represents, so we don't need to update
-					// victimParent.dirents etc.
-				}
-				victim.parent.dirMu.Unlock()
-			}
-			victim.destroyLocked(ctx)
-		}
+		d.fs.evictCachedDentryLocked(ctx)
 		// Whether or not victim was destroyed, we brought fs.cachedDentriesLen
 		// back down to fs.opts.maxCachedDentries, so we don't loop.
 	}
 }
 
+// Precondition: fs.renameMu must be locked for writing; it may be temporarily
+// unlocked.
+func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) {
+	for fs.cachedDentriesLen != 0 {
+		fs.evictCachedDentryLocked(ctx)
+	}
+}
+
+// Preconditions:
+// * fs.renameMu must be locked for writing; it may be temporarily unlocked.
+// * fs.cachedDentriesLen != 0.
+func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) {
+	victim := fs.cachedDentries.Back()
+	fs.cachedDentries.Remove(victim)
+	fs.cachedDentriesLen--
+	victim.cached = false
+	// victim.refs may have become non-zero from an earlier path resolution
+	// since it was inserted into fs.cachedDentries.
+	if atomic.LoadInt64(&victim.refs) == 0 {
+		if victim.parent != nil {
+			victim.parent.dirMu.Lock()
+			if !victim.vfsd.IsDead() {
+				// Note that victim can't be a mount point (in any mount
+				// namespace), since VFS holds references on mount points.
+				fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd)
+				delete(victim.parent.children, victim.name)
+				// We're only deleting the dentry, not the file it
+				// represents, so we don't need to update
+				// victimParent.dirents etc.
+			}
+			victim.parent.dirMu.Unlock()
+		}
+		victim.destroyLocked(ctx)
+	}
+}
+
 // destroyLocked destroys the dentry.
 //
 // Preconditions:
@@ -1293,30 +1463,19 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 	d.handleMu.Unlock()
 
 	if !d.file.isNil() {
-		if !d.isDeleted() {
-			// Write dirty timestamps back to the remote filesystem.
-			atimeDirty := atomic.LoadUint32(&d.atimeDirty) != 0
-			mtimeDirty := atomic.LoadUint32(&d.mtimeDirty) != 0
-			if atimeDirty || mtimeDirty {
-				atime := atomic.LoadInt64(&d.atime)
-				mtime := atomic.LoadInt64(&d.mtime)
-				if err := d.file.setAttr(ctx, p9.SetAttrMask{
-					ATime:              atimeDirty,
-					ATimeNotSystemTime: atimeDirty,
-					MTime:              mtimeDirty,
-					MTimeNotSystemTime: mtimeDirty,
-				}, p9.SetAttr{
-					ATimeSeconds:     uint64(atime / 1e9),
-					ATimeNanoSeconds: uint64(atime % 1e9),
-					MTimeSeconds:     uint64(mtime / 1e9),
-					MTimeNanoSeconds: uint64(mtime % 1e9),
-				}); err != nil {
-					log.Warningf("gofer.dentry.destroyLocked: failed to write dirty timestamps back: %v", err)
-				}
-			}
+		// Note that it's possible that d.atimeDirty or d.mtimeDirty are true,
+		// i.e. client and server timestamps may differ (because e.g. a client
+		// write was serviced by the page cache, and only written back to the
+		// remote file later). Ideally, we'd write client timestamps back to
+		// the remote filesystem so that timestamps for a new dentry
+		// instantiated for the same file would remain coherent. Unfortunately,
+		// this turns out to be too expensive in many cases, so for now we
+		// don't do this.
+		if err := d.file.close(ctx); err != nil {
+			log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err)
 		}
-		d.file.close(ctx)
 		d.file = p9file{}
+
 		// Remove d from the set of syncable dentries.
 		d.fs.syncMu.Lock()
 		delete(d.fs.syncableDentries, d)
@@ -1334,6 +1493,10 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 			panic("gofer.dentry.DecRef() called without holding a reference")
 		}
 	}
+
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Unregister(d, "gofer.dentry")
+	}
 }
 
 func (d *dentry) isDeleted() bool {
@@ -1344,9 +1507,7 @@ func (d *dentry) setDeleted() {
 	atomic.StoreUint32(&d.deleted, 1)
 }
 
-// We only support xattrs prefixed with "user." (see b/148380782). Currently,
-// there is no need to expose any other xattrs through a gofer.
-func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
 	if d.file.isNil() || !d.userXattrSupported() {
 		return nil, nil
 	}
@@ -1356,6 +1517,7 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
 	}
 	xattrs := make([]string, 0, len(xattrMap))
 	for x := range xattrMap {
+		// We only support xattrs in the user.* namespace.
 		if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
 			xattrs = append(xattrs, x)
 		}
@@ -1363,51 +1525,33 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
 	return xattrs, nil
 }
 
-func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
 	if d.file.isNil() {
 		return "", syserror.ENODATA
 	}
-	if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
+	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
 		return "", err
 	}
-	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
-		return "", syserror.EOPNOTSUPP
-	}
-	if !d.userXattrSupported() {
-		return "", syserror.ENODATA
-	}
 	return d.file.getXattr(ctx, opts.Name, opts.Size)
 }
 
-func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
 	if d.file.isNil() {
 		return syserror.EPERM
 	}
-	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
 		return err
 	}
-	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
-		return syserror.EOPNOTSUPP
-	}
-	if !d.userXattrSupported() {
-		return syserror.EPERM
-	}
 	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
 }
 
-func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error {
 	if d.file.isNil() {
 		return syserror.EPERM
 	}
-	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
 		return err
 	}
-	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
-		return syserror.EOPNOTSUPP
-	}
-	if !d.userXattrSupported() {
-		return syserror.EPERM
-	}
 	return d.file.removeXattr(ctx, name)
 }
 
@@ -1418,7 +1562,9 @@ func (d *dentry) userXattrSupported() bool {
 	return filetype == linux.ModeRegular || filetype == linux.ModeDirectory
 }
 
-// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDir().
+// Preconditions:
+// * !d.isSynthetic().
+// * d.isRegularFile() || d.isDir().
 func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
 	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
 	// O_TRUNC).
@@ -1463,8 +1609,9 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 			return err
 		}
 
-		if d.hostFD < 0 && openReadable && h.fd >= 0 {
-			// We have no existing FD; use the new FD for at least reading.
+		if d.hostFD < 0 && h.fd >= 0 && openReadable && (d.writeFile.isNil() || openWritable) {
+			// We have no existing FD, and the new FD meets the requirements
+			// for d.hostFD, so start using it.
 			d.hostFD = h.fd
 		} else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable {
 			// We have an existing read-only FD, but the file has just been
@@ -1593,6 +1740,33 @@ func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
 	return nil
 }
 
+func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error {
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	h := d.writeHandleLocked()
+	if h.isOpen() {
+		// Write back dirty pages to the remote file.
+		d.dataMu.Lock()
+		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
+		d.dataMu.Unlock()
+		if err != nil {
+			return err
+		}
+	}
+	if err := d.syncRemoteFileLocked(ctx); err != nil {
+		if !forFilesystemSync {
+			return err
+		}
+		// Only return err if we can reasonably have expected sync to succeed
+		// (d is a regular file and was opened for writing).
+		if d.isRegularFile() && h.isOpen() {
+			return err
+		}
+		ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err)
+	}
+	return nil
+}
+
 // incLinks increments link count.
 func (d *dentry) incLinks() {
 	if atomic.LoadUint32(&d.nlink) == 0 {
@@ -1613,12 +1787,14 @@ func (d *dentry) decLinks() {
 
 // fileDescription is embedded by gofer implementations of
 // vfs.FileDescriptionImpl.
+//
+// +stateify savable
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.LockFD
 
-	lockLogging sync.Once
+	lockLogging sync.Once `state:"nosave"`
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
@@ -1656,30 +1832,30 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 	return nil
 }
 
-// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
-	return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size)
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size)
 }
 
-// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
-	return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+	return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts)
 }
 
-// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
-func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
 	d := fd.dentry()
-	if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
+	if err := d.setXattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
 		return err
 	}
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
 	return nil
 }
 
-// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
-func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
 	d := fd.dentry()
-	if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
+	if err := d.removeXattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
 		return err
 	}
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
index bfe75dfe4..76f08e252 100644
--- a/pkg/sentry/fsimpl/gofer/gofer_test.go
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -26,12 +26,13 @@ import (
 func TestDestroyIdempotent(t *testing.T) {
 	ctx := contexttest.Context(t)
 	fs := filesystem{
-		mfp:              pgalloc.MemoryFileProviderFromContext(ctx),
-		syncableDentries: make(map[*dentry]struct{}),
+		mfp: pgalloc.MemoryFileProviderFromContext(ctx),
 		opts: filesystemOptions{
 			// Test relies on no dentry being held in the cache.
 			maxCachedDentries: 0,
 		},
+		syncableDentries: make(map[*dentry]struct{}),
+		inoByQIDPath:     make(map[uint64]uint64),
 	}
 
 	attr := &p9.Attr{
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index 104157512..a9ebe1206 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -25,6 +25,8 @@ import (
 
 // handle represents a remote "open file descriptor", consisting of an opened
 // fid (p9.File) and optionally a host file descriptor.
+//
+// These are explicitly not savable.
 type handle struct {
 	file p9file
 	fd   int32 // -1 if unavailable
diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
index 7294de7d6..c7bf10007 100644
--- a/pkg/sentry/fsimpl/gofer/host_named_pipe.go
+++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
@@ -51,8 +51,24 @@ func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error {
 		if ok {
 			return nil
 		}
-		if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
-			return err
+		if sleepErr := sleepBetweenNamedPipeOpenChecks(ctx); sleepErr != nil {
+			// Another application thread may have opened this pipe for
+			// writing, succeeded because we previously opened the pipe for
+			// reading, and subsequently interrupted us for checkpointing (e.g.
+			// this occurs in mknod tests under cooperative save/restore). In
+			// this case, our open has to succeed for the checkpoint to include
+			// a readable FD for the pipe, which is in turn necessary to
+			// restore the other thread's writable FD for the same pipe
+			// (otherwise it will get ENXIO). So we have to check
+			// nonblockingPipeHasWriter() once last time.
+			ok, err := nonblockingPipeHasWriter(fd)
+			if err != nil {
+				return err
+			}
+			if ok {
+				return nil
+			}
+			return sleepErr
 		}
 	}
 }
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
index 87f0b877f..21b4a96fe 100644
--- a/pkg/sentry/fsimpl/gofer/p9file.go
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -127,6 +127,13 @@ func (f p9file) close(ctx context.Context) error {
 	return err
 }
 
+func (f p9file) setAttrClose(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.SetAttrClose(valid, attr)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
 func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 	ctx.UninterruptibleSleepStart(false)
 	fdobj, qid, iounit, err := f.file.Open(flags)
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 7e1cbf065..dc8a890cb 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -18,7 +18,6 @@ import (
 	"fmt"
 	"io"
 	"math"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -31,6 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -39,11 +39,12 @@ func (d *dentry) isRegularFile() bool {
 	return d.fileType() == linux.S_IFREG
 }
 
+// +stateify savable
 type regularFileFD struct {
 	fileDescription
 
 	// off is the file offset. off is protected by mu.
-	mu  sync.Mutex
+	mu  sync.Mutex `state:"nosave"`
 	off int64
 }
 
@@ -56,10 +57,16 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
 	if !fd.vfsfd.IsWritable() {
 		return nil
 	}
-	// Skip flushing if writes may be buffered by the client, since (as with
-	// the VFS1 client) we don't flush buffered writes on close anyway.
+	// Skip flushing if there are client-buffered writes, since (as with the
+	// VFS1 client) we don't flush buffered writes on close anyway.
 	d := fd.dentry()
-	if d.fs.opts.interop == InteropModeExclusive {
+	if d.fs.opts.interop != InteropModeExclusive {
+		return nil
+	}
+	d.dataMu.RLock()
+	haveDirtyPages := !d.dirty.IsEmpty()
+	d.dataMu.RUnlock()
+	if haveDirtyPages {
 		return nil
 	}
 	d.handleMu.RLock()
@@ -73,28 +80,11 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
 // Allocate implements vfs.FileDescriptionImpl.Allocate.
 func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
 	d := fd.dentry()
-	d.metadataMu.Lock()
-	defer d.metadataMu.Unlock()
-
-	// Allocating a smaller size is a noop.
-	size := offset + length
-	if d.cachedMetadataAuthoritative() && size <= d.size {
-		return nil
-	}
-
-	d.handleMu.RLock()
-	err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
-	d.handleMu.RUnlock()
-	if err != nil {
-		return err
-	}
-	d.dataMu.Lock()
-	atomic.StoreUint64(&d.size, size)
-	d.dataMu.Unlock()
-	if d.cachedMetadataAuthoritative() {
-		d.touchCMtimeLocked()
-	}
-	return nil
+	return d.doAllocate(ctx, offset, length, func() error {
+		d.handleMu.RLock()
+		defer d.handleMu.RUnlock()
+		return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+	})
 }
 
 // PRead implements vfs.FileDescriptionImpl.PRead.
@@ -117,6 +107,10 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 		return 0, io.EOF
 	}
 
+	var (
+		n       int64
+		readErr error
+	)
 	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
 		// Lock d.metadataMu for the rest of the read to prevent d.size from
 		// changing.
@@ -127,20 +121,25 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 		if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
 			return 0, err
 		}
-	}
-
-	rw := getDentryReadWriter(ctx, d, offset)
-	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		rw := getDentryReadWriter(ctx, d, offset)
 		// Require the read to go to the remote file.
 		rw.direct = true
+		n, readErr = dst.CopyOutFrom(ctx, rw)
+		putDentryReadWriter(rw)
+		if d.fs.opts.interop != InteropModeShared {
+			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+			d.touchAtimeLocked(fd.vfsfd.Mount())
+		}
+	} else {
+		rw := getDentryReadWriter(ctx, d, offset)
+		n, readErr = dst.CopyOutFrom(ctx, rw)
+		putDentryReadWriter(rw)
+		if d.fs.opts.interop != InteropModeShared {
+			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+			d.touchAtime(fd.vfsfd.Mount())
+		}
 	}
-	n, err := dst.CopyOutFrom(ctx, rw)
-	putDentryReadWriter(rw)
-	if d.fs.opts.interop != InteropModeShared {
-		// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
-		d.touchAtime(fd.vfsfd.Mount())
-	}
-	return n, err
+	return n, readErr
 }
 
 // Read implements vfs.FileDescriptionImpl.Read.
@@ -396,7 +395,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
 					End:   gapEnd,
 				}
 				optMR := gap.Range()
-				err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, h.readToBlocksAt)
+				err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size, mf, usage.PageCache, h.readToBlocksAt)
 				mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
 				seg, gap = rw.d.cache.Find(rw.off)
 				if !seg.Ok() {
@@ -404,10 +403,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
 					rw.d.handleMu.RUnlock()
 					return done, err
 				}
-				// err might have occurred in part of gap.Range() outside
-				// gapMR. Forget about it for now; if the error matters and
-				// persists, we'll run into it again in a later iteration of
-				// this loop.
+				// err might have occurred in part of gap.Range() outside gapMR
+				// (in particular, gap.End() might be beyond EOF). Forget about
+				// it for now; if the error matters and persists, we'll run
+				// into it again in a later iteration of this loop.
 			} else {
 				// Read directly from the file.
 				gapDsts := dsts.TakeFirst64(gapMR.Length())
@@ -625,23 +624,7 @@ func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int6
 
 // Sync implements vfs.FileDescriptionImpl.Sync.
 func (fd *regularFileFD) Sync(ctx context.Context) error {
-	return fd.dentry().syncCachedFile(ctx)
-}
-
-func (d *dentry) syncCachedFile(ctx context.Context) error {
-	d.handleMu.RLock()
-	defer d.handleMu.RUnlock()
-
-	if h := d.writeHandleLocked(); h.isOpen() {
-		d.dataMu.Lock()
-		// Write dirty cached data to the remote file.
-		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
-		d.dataMu.Unlock()
-		if err != nil {
-			return err
-		}
-	}
-	return d.syncRemoteFileLocked(ctx)
+	return fd.dentry().syncCachedFile(ctx, false /* lowSyncExpectations */)
 }
 
 // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
@@ -781,7 +764,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab
 
 	mf := d.fs.mfp.MemoryFile()
 	h := d.readHandleLocked()
-	cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, h.readToBlocksAt)
+	cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size, mf, usage.PageCache, h.readToBlocksAt)
 
 	var ts []memmap.Translation
 	var translatedEnd uint64
@@ -900,6 +883,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
 // dentryPlatformFile is only used when a host FD representing the remote file
 // is available (i.e. dentry.hostFD >= 0), and that FD is used for application
 // memory mappings (i.e. !filesystem.opts.forcePageCache).
+//
+// +stateify savable
 type dentryPlatformFile struct {
 	*dentry
 
@@ -912,7 +897,7 @@ type dentryPlatformFile struct {
 	hostFileMapper fsutil.HostFileMapper
 
 	// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
-	hostFileMapperInitOnce sync.Once
+	hostFileMapperInitOnce sync.Once `state:"nosave"`
 }
 
 // IncRef implements memmap.File.IncRef.
diff --git a/pkg/sentry/fsimpl/gofer/save_restore.go b/pkg/sentry/fsimpl/gofer/save_restore.go
new file mode 100644
index 000000000..2ea224c43
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/save_restore.go
@@ -0,0 +1,329 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"fmt"
+	"io"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+type saveRestoreContextID int
+
+const (
+	// CtxRestoreServerFDMap is a Context.Value key for a map[string]int
+	// mapping filesystem unique IDs (cf. InternalFilesystemOptions.UniqueID)
+	// to host FDs.
+	CtxRestoreServerFDMap saveRestoreContextID = iota
+)
+
+// +stateify savable
+type savedDentryRW struct {
+	read  bool
+	write bool
+}
+
+// PreprareSave implements vfs.FilesystemImplSaveRestoreExtension.PrepareSave.
+func (fs *filesystem) PrepareSave(ctx context.Context) error {
+	if len(fs.iopts.UniqueID) == 0 {
+		return fmt.Errorf("gofer.filesystem with no UniqueID cannot be saved")
+	}
+
+	// Purge cached dentries, which may not be reopenable after restore due to
+	// permission changes.
+	fs.renameMu.Lock()
+	fs.evictAllCachedDentriesLocked(ctx)
+	fs.renameMu.Unlock()
+
+	// Buffer pipe data so that it's available for reading after restore. (This
+	// is a legacy VFS1 feature.)
+	fs.syncMu.Lock()
+	for sffd := range fs.specialFileFDs {
+		if sffd.dentry().fileType() == linux.S_IFIFO && sffd.vfsfd.IsReadable() {
+			if err := sffd.savePipeData(ctx); err != nil {
+				fs.syncMu.Unlock()
+				return err
+			}
+		}
+	}
+	fs.syncMu.Unlock()
+
+	// Flush local state to the remote filesystem.
+	if err := fs.Sync(ctx); err != nil {
+		return err
+	}
+
+	fs.savedDentryRW = make(map[*dentry]savedDentryRW)
+	return fs.root.prepareSaveRecursive(ctx)
+}
+
+// Preconditions:
+// * fd represents a pipe.
+// * fd is readable.
+func (fd *specialFileFD) savePipeData(ctx context.Context) error {
+	fd.bufMu.Lock()
+	defer fd.bufMu.Unlock()
+	var buf [usermem.PageSize]byte
+	for {
+		n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:])), ^uint64(0))
+		if n != 0 {
+			fd.buf = append(fd.buf, buf[:n]...)
+		}
+		if err != nil {
+			if err == io.EOF || err == syserror.EAGAIN {
+				break
+			}
+			return err
+		}
+	}
+	if len(fd.buf) != 0 {
+		atomic.StoreUint32(&fd.haveBuf, 1)
+	}
+	return nil
+}
+
+func (d *dentry) prepareSaveRecursive(ctx context.Context) error {
+	if d.isRegularFile() && !d.cachedMetadataAuthoritative() {
+		// Get updated metadata for d in case we need to perform metadata
+		// validation during restore.
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	if !d.readFile.isNil() || !d.writeFile.isNil() {
+		d.fs.savedDentryRW[d] = savedDentryRW{
+			read:  !d.readFile.isNil(),
+			write: !d.writeFile.isNil(),
+		}
+	}
+	d.dirMu.Lock()
+	defer d.dirMu.Unlock()
+	for _, child := range d.children {
+		if child != nil {
+			if err := child.prepareSaveRecursive(ctx); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// beforeSave is invoked by stateify.
+func (d *dentry) beforeSave() {
+	if d.vfsd.IsDead() {
+		panic(fmt.Sprintf("gofer.dentry(%q).beforeSave: deleted and invalidated dentries can't be restored", genericDebugPathname(d)))
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (d *dentry) afterLoad() {
+	d.hostFD = -1
+	if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&d.refs) != -1 {
+		refsvfs2.Register(d, "gofer.dentry")
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (d *dentryPlatformFile) afterLoad() {
+	if d.hostFileMapper.IsInited() {
+		// Ensure that we don't call d.hostFileMapper.Init() again.
+		d.hostFileMapperInitOnce.Do(func() {})
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (fd *specialFileFD) afterLoad() {
+	fd.handle.fd = -1
+}
+
+// CompleteRestore implements
+// vfs.FilesystemImplSaveRestoreExtension.CompleteRestore.
+func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRestoreOptions) error {
+	fdmapv := ctx.Value(CtxRestoreServerFDMap)
+	if fdmapv == nil {
+		return fmt.Errorf("no server FD map available")
+	}
+	fdmap := fdmapv.(map[string]int)
+	fd, ok := fdmap[fs.iopts.UniqueID]
+	if !ok {
+		return fmt.Errorf("no server FD available for filesystem with unique ID %q", fs.iopts.UniqueID)
+	}
+	fs.opts.fd = fd
+	if err := fs.dial(ctx); err != nil {
+		return err
+	}
+	fs.inoByQIDPath = make(map[uint64]uint64)
+
+	// Restore the filesystem root.
+	ctx.UninterruptibleSleepStart(false)
+	attached, err := fs.client.Attach(fs.opts.aname)
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		return err
+	}
+	attachFile := p9file{attached}
+	qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
+	if err != nil {
+		return err
+	}
+	if err := fs.root.restoreFile(ctx, attachFile, qid, attrMask, &attr, &opts); err != nil {
+		return err
+	}
+
+	// Restore remaining dentries.
+	if err := fs.root.restoreDescendantsRecursive(ctx, &opts); err != nil {
+		return err
+	}
+
+	// Re-open handles for specialFileFDs. Unlike the initial open
+	// (dentry.openSpecialFile()), pipes are always opened without blocking;
+	// non-readable pipe FDs are opened last to ensure that they don't get
+	// ENXIO if another specialFileFD represents the read end of the same pipe.
+	// This is consistent with VFS1.
+	haveWriteOnlyPipes := false
+	for fd := range fs.specialFileFDs {
+		if fd.dentry().fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() {
+			haveWriteOnlyPipes = true
+			continue
+		}
+		if err := fd.completeRestore(ctx); err != nil {
+			return err
+		}
+	}
+	if haveWriteOnlyPipes {
+		for fd := range fs.specialFileFDs {
+			if fd.dentry().fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() {
+				if err := fd.completeRestore(ctx); err != nil {
+					return err
+				}
+			}
+		}
+	}
+
+	// Discard state only required during restore.
+	fs.savedDentryRW = nil
+
+	return nil
+}
+
+func (d *dentry) restoreFile(ctx context.Context, file p9file, qid p9.QID, attrMask p9.AttrMask, attr *p9.Attr, opts *vfs.CompleteRestoreOptions) error {
+	d.file = file
+
+	// Gofers do not preserve QID across checkpoint/restore, so:
+	//
+	// - We must assume that the remote filesystem did not change in a way that
+	// would invalidate dentries, since we can't revalidate dentries by
+	// checking QIDs.
+	//
+	// - We need to associate the new QID.Path with the existing d.ino.
+	d.qidPath = qid.Path
+	d.fs.inoMu.Lock()
+	d.fs.inoByQIDPath[qid.Path] = d.ino
+	d.fs.inoMu.Unlock()
+
+	// Check metadata stability before updating metadata.
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+	if d.isRegularFile() {
+		if opts.ValidateFileSizes {
+			if !attrMask.Size {
+				return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: file size not available", genericDebugPathname(d))
+			}
+			if d.size != attr.Size {
+				return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(d), d.size, attr.Size)
+			}
+		}
+		if opts.ValidateFileModificationTimestamps {
+			if !attrMask.MTime {
+				return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime not available", genericDebugPathname(d))
+			}
+			if want := dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds); d.mtime != want {
+				return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(d), linux.NsecToStatxTimestamp(d.mtime), linux.NsecToStatxTimestamp(want))
+			}
+		}
+	}
+	if !d.cachedMetadataAuthoritative() {
+		d.updateFromP9AttrsLocked(attrMask, attr)
+	}
+
+	if rw, ok := d.fs.savedDentryRW[d]; ok {
+		if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// Preconditions: d is not synthetic.
+func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error {
+	for _, child := range d.children {
+		if child == nil {
+			continue
+		}
+		if _, ok := d.fs.syncableDentries[child]; !ok {
+			// child is synthetic.
+			continue
+		}
+		if err := child.restoreRecursive(ctx, opts); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Preconditions: d is not synthetic (but note that since this function
+// restores d.file, d.file.isNil() is always true at this point, so this can
+// only be detected by checking filesystem.syncableDentries). d.parent has been
+// restored.
+func (d *dentry) restoreRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error {
+	qid, file, attrMask, attr, err := d.parent.file.walkGetAttrOne(ctx, d.name)
+	if err != nil {
+		return err
+	}
+	if err := d.restoreFile(ctx, file, qid, attrMask, &attr, opts); err != nil {
+		return err
+	}
+	return d.restoreDescendantsRecursive(ctx, opts)
+}
+
+func (fd *specialFileFD) completeRestore(ctx context.Context) error {
+	d := fd.dentry()
+	h, err := openHandle(ctx, d.file, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */)
+	if err != nil {
+		return err
+	}
+	fd.handle = h
+
+	ftype := d.fileType()
+	fd.haveQueue = (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && fd.handle.fd >= 0
+	if fd.haveQueue {
+		if err := fdnotifier.AddFD(fd.handle.fd, &fd.queue); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go
index 85d2bee72..a21199eac 100644
--- a/pkg/sentry/fsimpl/gofer/socket.go
+++ b/pkg/sentry/fsimpl/gofer/socket.go
@@ -36,13 +36,12 @@ func (d *dentry) isSocket() bool {
 // An endpoint's lifetime is the time between when filesystem.BoundEndpointAt()
 // is called and either BoundEndpoint.BidirectionalConnect or
 // BoundEndpoint.UnidirectionalConnect is called.
+//
+// +stateify savable
 type endpoint struct {
 	// dentry is the filesystem dentry which produced this endpoint.
 	dentry *dentry
 
-	// file is the p9 file that contains a single unopened fid.
-	file p9.File
-
 	// path is the sentry path where this endpoint is bound.
 	path string
 }
@@ -114,7 +113,7 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect
 }
 
 func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) {
-	hostFile, err := e.file.Connect(flags)
+	hostFile, err := e.dentry.file.connect(ctx, flags)
 	if err != nil {
 		return nil, syserr.ErrConnectionRefused
 	}
@@ -129,7 +128,7 @@ func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFla
 
 	c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path)
 	if serr != nil {
-		log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, flags, serr)
+		log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.dentry.file, flags, serr)
 		return nil, serr
 	}
 	return c, nil
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 3c39aa9b7..625400c0b 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -15,15 +15,16 @@
 package gofer
 
 import (
-	"sync"
 	"sync/atomic"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -33,11 +34,13 @@ import (
 // special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is
 // in effect) regular files. specialFileFD differs from regularFileFD by using
 // per-FD handles instead of shared per-dentry handles, and never buffering I/O.
+//
+// +stateify savable
 type specialFileFD struct {
 	fileDescription
 
 	// handle is used for file I/O. handle is immutable.
-	handle handle
+	handle handle `state:"nosave"`
 
 	// isRegularFile is true if this FD represents a regular file which is only
 	// possible when filesystemOptions.regularFilesUseSpecialFileFD is in
@@ -51,12 +54,20 @@ type specialFileFD struct {
 
 	// haveQueue is true if this file description represents a file for which
 	// queue may send I/O readiness events. haveQueue is immutable.
-	haveQueue bool
+	haveQueue bool `state:"nosave"`
 	queue     waiter.Queue
 
 	// If seekable is true, off is the file offset. off is protected by mu.
-	mu  sync.Mutex
+	mu  sync.Mutex `state:"nosave"`
 	off int64
+
+	// If haveBuf is non-zero, this FD represents a pipe, and buf contains data
+	// read from the pipe from previous calls to specialFileFD.savePipeData().
+	// haveBuf and buf are protected by bufMu. haveBuf is accessed using atomic
+	// memory operations.
+	bufMu   sync.Mutex `state:"nosave"`
+	haveBuf uint32
+	buf     []byte
 }
 
 func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) {
@@ -84,6 +95,9 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks,
 		}
 		return nil, err
 	}
+	d.fs.syncMu.Lock()
+	d.fs.specialFileFDs[fd] = struct{}{}
+	d.fs.syncMu.Unlock()
 	return fd, nil
 }
 
@@ -135,6 +149,16 @@ func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
 	fd.fileDescription.EventUnregister(e)
 }
 
+func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	if fd.isRegularFile {
+		d := fd.dentry()
+		return d.doAllocate(ctx, offset, length, func() error {
+			return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+		})
+	}
+	return fd.FileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length)
+}
+
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	if fd.seekable && offset < 0 {
@@ -148,26 +172,51 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 		return 0, syserror.EOPNOTSUPP
 	}
 
-	// Going through dst.CopyOutFrom() holds MM locks around file operations of
-	// unknown duration. For regularFileFD, doing so is necessary to support
-	// mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't
-	// hold here since specialFileFD doesn't client-cache data. Just buffer the
-	// read instead.
 	if d := fd.dentry(); d.cachedMetadataAuthoritative() {
 		d.touchAtime(fd.vfsfd.Mount())
 	}
+
+	bufN := int64(0)
+	if atomic.LoadUint32(&fd.haveBuf) != 0 {
+		var err error
+		fd.bufMu.Lock()
+		if len(fd.buf) != 0 {
+			var n int
+			n, err = dst.CopyOut(ctx, fd.buf)
+			dst = dst.DropFirst(n)
+			fd.buf = fd.buf[n:]
+			if len(fd.buf) == 0 {
+				atomic.StoreUint32(&fd.haveBuf, 0)
+				fd.buf = nil
+			}
+			bufN = int64(n)
+			if offset >= 0 {
+				offset += bufN
+			}
+		}
+		fd.bufMu.Unlock()
+		if err != nil {
+			return bufN, err
+		}
+	}
+
+	// Going through dst.CopyOutFrom() would hold MM locks around file
+	// operations of unknown duration. For regularFileFD, doing so is necessary
+	// to support mmap due to lock ordering; MM locks precede dentry.dataMu.
+	// That doesn't hold here since specialFileFD doesn't client-cache data.
+	// Just buffer the read instead.
 	buf := make([]byte, dst.NumBytes())
 	n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
 	if err == syserror.EAGAIN {
 		err = syserror.ErrWouldBlock
 	}
 	if n == 0 {
-		return 0, err
+		return bufN, err
 	}
 	if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil {
-		return int64(cp), cperr
+		return bufN + int64(cp), cperr
 	}
-	return int64(n), err
+	return bufN + int64(n), err
 }
 
 // Read implements vfs.FileDescriptionImpl.Read.
@@ -204,16 +253,16 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
 	}
 
 	d := fd.dentry()
-	// If the regular file fd was opened with O_APPEND, make sure the file size
-	// is updated. There is a possible race here if size is modified externally
-	// after metadata cache is updated.
-	if fd.isRegularFile && fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
-		if err := d.updateFromGetattr(ctx); err != nil {
-			return 0, offset, err
+	if fd.isRegularFile {
+		// If the regular file fd was opened with O_APPEND, make sure the file
+		// size is updated. There is a possible race here if size is modified
+		// externally after metadata cache is updated.
+		if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+			if err := d.updateFromGetattr(ctx); err != nil {
+				return 0, offset, err
+			}
 		}
-	}
 
-	if fd.isRegularFile {
 		// We need to hold the metadataMu *while* writing to a regular file.
 		d.metadataMu.Lock()
 		defer d.metadataMu.Unlock()
@@ -235,11 +284,12 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
 		d.touchCMtime()
 	}
 	buf := make([]byte, src.NumBytes())
-	// Don't do partial writes if we get a partial read from src.
-	if _, err := src.CopyIn(ctx, buf); err != nil {
-		return 0, offset, err
+	copied, copyErr := src.CopyIn(ctx, buf)
+	if copied == 0 && copyErr != nil {
+		// Only return the error if we didn't get any data.
+		return 0, offset, copyErr
 	}
-	n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:copied])), uint64(offset))
 	if err == syserror.EAGAIN {
 		err = syserror.ErrWouldBlock
 	}
@@ -256,7 +306,10 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
 			atomic.StoreUint64(&d.size, uint64(offset))
 		}
 	}
-	return int64(n), offset, err
+	if err != nil {
+		return int64(n), offset, err
+	}
+	return int64(n), offset, copyErr
 }
 
 // Write implements vfs.FileDescriptionImpl.Write.
@@ -289,13 +342,31 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (
 
 // Sync implements vfs.FileDescriptionImpl.Sync.
 func (fd *specialFileFD) Sync(ctx context.Context) error {
-	// If we have a host FD, fsyncing it is likely to be faster than an fsync
-	// RPC.
-	if fd.handle.fd >= 0 {
-		ctx.UninterruptibleSleepStart(false)
-		err := syscall.Fsync(int(fd.handle.fd))
-		ctx.UninterruptibleSleepFinish(false)
-		return err
+	return fd.sync(ctx, false /* forFilesystemSync */)
+}
+
+func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error {
+	err := func() error {
+		// If we have a host FD, fsyncing it is likely to be faster than an fsync
+		// RPC.
+		if fd.handle.fd >= 0 {
+			ctx.UninterruptibleSleepStart(false)
+			err := syscall.Fsync(int(fd.handle.fd))
+			ctx.UninterruptibleSleepFinish(false)
+			return err
+		}
+		return fd.handle.file.fsync(ctx)
+	}()
+	if err != nil {
+		if !forFilesystemSync {
+			return err
+		}
+		// Only return err if we can reasonably have expected sync to succeed
+		// (fd represents a regular file that was opened for writing).
+		if fd.isRegularFile && fd.vfsfd.IsWritable() {
+			return err
+		}
+		ctx.Debugf("gofer.specialFileFD.sync: syncing non-writable or non-regular-file FD failed: %v", err)
 	}
-	return fd.handle.file.fsync(ctx)
+	return nil
 }
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index e59d07e90..9cbe805b9 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -17,7 +17,6 @@ package gofer
 import (
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
@@ -25,17 +24,6 @@ func dentryTimestampFromP9(s, ns uint64) int64 {
 	return int64(s*1e9 + ns)
 }
 
-func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 {
-	return ts.Sec*1e9 + int64(ts.Nsec)
-}
-
-func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
-	return linux.StatxTimestamp{
-		Sec:  ns / 1e9,
-		Nsec: uint32(ns % 1e9),
-	}
-}
-
 // Preconditions: d.cachedMetadataAuthoritative() == true.
 func (d *dentry) touchAtime(mnt *vfs.Mount) {
 	if mnt.Flags.NoATime || mnt.ReadOnly() {
@@ -52,8 +40,23 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) {
 	mnt.EndWrite()
 }
 
-// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
-// successfully called vfs.Mount.CheckBeginWrite().
+// Preconditions: d.metadataMu is locked. d.cachedMetadataAuthoritative() == true.
+func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) {
+	if mnt.Flags.NoATime || mnt.ReadOnly() {
+		return
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return
+	}
+	now := d.fs.clock.Now().Nanoseconds()
+	atomic.StoreInt64(&d.atime, now)
+	atomic.StoreUint32(&d.atimeDirty, 1)
+	mnt.EndWrite()
+}
+
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has successfully called vfs.Mount.CheckBeginWrite().
 func (d *dentry) touchCtime() {
 	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
@@ -61,8 +64,9 @@ func (d *dentry) touchCtime() {
 	d.metadataMu.Unlock()
 }
 
-// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
-// successfully called vfs.Mount.CheckBeginWrite().
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has successfully called vfs.Mount.CheckBeginWrite().
 func (d *dentry) touchCMtime() {
 	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
@@ -72,8 +76,9 @@ func (d *dentry) touchCMtime() {
 	d.metadataMu.Unlock()
 }
 
-// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
-// locked d.metadataMu.
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has locked d.metadataMu.
 func (d *dentry) touchCMtimeLocked() {
 	now := d.fs.clock.Now().Nanoseconds()
 	atomic.StoreInt64(&d.mtime, now)
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index bd701bbc7..dc0f86061 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -1,14 +1,40 @@
 load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "inode_refs",
+    out = "inode_refs.go",
+    package = "host",
+    prefix = "inode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "inode",
+    },
+)
+
+go_template_instance(
+    name = "connected_endpoint_refs",
+    out = "connected_endpoint_refs.go",
+    package = "host",
+    prefix = "ConnectedEndpoint",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "ConnectedEndpoint",
+    },
+)
+
 go_library(
     name = "host",
     srcs = [
+        "connected_endpoint_refs.go",
         "control.go",
         "host.go",
+        "inode_refs.go",
         "ioctl_unsafe.go",
         "mmap.go",
+        "save_restore.go",
         "socket.go",
         "socket_iovec.go",
         "socket_unsafe.go",
@@ -24,7 +50,9 @@ go_library(
         "//pkg/fspath",
         "//pkg/iovec",
         "//pkg/log",
+        "//pkg/marshal/primitive",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs/fsutil",
diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go
index 0135e4428..13ef48cb5 100644
--- a/pkg/sentry/fsimpl/host/control.go
+++ b/pkg/sentry/fsimpl/host/control.go
@@ -79,7 +79,7 @@ func fdsToFiles(ctx context.Context, fds []int) []*vfs.FileDescription {
 		}
 
 		// Create the file backed by hostFD.
-		file, err := ImportFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, false /* isTTY */)
+		file, err := NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, &NewFDOptions{})
 		if err != nil {
 			ctx.Warningf("Error creating file from host FD: %v", err)
 			break
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 56869f59a..eeed0f97d 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -19,6 +19,7 @@ package host
 import (
 	"fmt"
 	"math"
+	"sync/atomic"
 	"syscall"
 
 	"golang.org/x/sys/unix"
@@ -27,7 +28,6 @@ import (
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/refs"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/hostfd"
@@ -41,8 +41,123 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// inode implements kernfs.Inode.
+//
+// +stateify savable
+type inode struct {
+	kernfs.InodeNoStatFS
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid.
+
+	locks vfs.FileLocks
+
+	// When the reference count reaches zero, the host fd is closed.
+	inodeRefs
+
+	// hostFD contains the host fd that this file was originally created from,
+	// which must be available at time of restore.
+	//
+	// This field is initialized at creation time and is immutable.
+	hostFD int
+
+	// ino is an inode number unique within this filesystem.
+	//
+	// This field is initialized at creation time and is immutable.
+	ino uint64
+
+	// ftype is the file's type (a linux.S_IFMT mask).
+	//
+	// This field is initialized at creation time and is immutable.
+	ftype uint16
+
+	// mayBlock is true if hostFD is non-blocking, and operations on it may
+	// return EAGAIN or EWOULDBLOCK instead of blocking.
+	//
+	// This field is initialized at creation time and is immutable.
+	mayBlock bool
+
+	// seekable is false if lseek(hostFD) returns ESPIPE. We assume that file
+	// offsets are meaningful iff seekable is true.
+	//
+	// This field is initialized at creation time and is immutable.
+	seekable bool
+
+	// isTTY is true if this file represents a TTY.
+	//
+	// This field is initialized at creation time and is immutable.
+	isTTY bool
+
+	// savable is true if hostFD may be saved/restored by its numeric value.
+	//
+	// This field is initialized at creation time and is immutable.
+	savable bool
+
+	// Event queue for blocking operations.
+	queue waiter.Queue
+
+	// mapsMu protects mappings.
+	mapsMu sync.Mutex `state:"nosave"`
+
+	// If this file is mmappable, mappings tracks mappings of hostFD into
+	// memmap.MappingSpaces.
+	mappings memmap.MappingSet
+
+	// pf implements platform.File for mappings of hostFD.
+	pf inodePlatformFile
+
+	// If haveBuf is non-zero, hostFD represents a pipe, and buf contains data
+	// read from the pipe from previous calls to inode.beforeSave(). haveBuf
+	// and buf are protected by bufMu. haveBuf is accessed using atomic memory
+	// operations.
+	bufMu   sync.Mutex `state:"nosave"`
+	haveBuf uint32
+	buf     []byte
+}
+
+func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, fileType linux.FileMode, isTTY bool) (*inode, error) {
+	// Determine if hostFD is seekable.
+	_, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
+	seekable := err != syserror.ESPIPE
+	// We expect regular files to be seekable, as this is required for them to
+	// be memory-mappable.
+	if !seekable && fileType == syscall.S_IFREG {
+		ctx.Infof("host.newInode: host FD %d is a non-seekable regular file", hostFD)
+		return nil, syserror.ESPIPE
+	}
+
+	i := &inode{
+		hostFD:   hostFD,
+		ino:      fs.NextIno(),
+		ftype:    uint16(fileType),
+		mayBlock: fileType != syscall.S_IFREG && fileType != syscall.S_IFDIR,
+		seekable: seekable,
+		isTTY:    isTTY,
+		savable:  savable,
+	}
+	i.pf.inode = i
+	i.EnableLeakCheck()
+
+	// If the hostFD can return EWOULDBLOCK when set to non-blocking, do so and
+	// handle blocking behavior in the sentry.
+	if i.mayBlock {
+		if err := syscall.SetNonblock(i.hostFD, true); err != nil {
+			return nil, err
+		}
+		if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
+			return nil, err
+		}
+	}
+	return i, nil
+}
+
 // NewFDOptions contains options to NewFD.
 type NewFDOptions struct {
+	// If Savable is true, the host file descriptor may be saved/restored by
+	// numeric value; the sandbox API requires a corresponding host FD with the
+	// same numeric value to be provieded at time of restore.
+	Savable bool
+
 	// If IsTTY is true, the file descriptor is a TTY.
 	IsTTY bool
 
@@ -76,45 +191,12 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
 		flags = uint32(flagsInt)
 	}
 
-	fileMode := linux.FileMode(s.Mode)
-	fileType := fileMode.FileType()
-
-	// Determine if hostFD is seekable. If not, this syscall will return ESPIPE
-	// (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character
-	// devices.
-	_, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
-	seekable := err != syserror.ESPIPE
-
-	i := &inode{
-		hostFD:     hostFD,
-		ino:        fs.NextIno(),
-		isTTY:      opts.IsTTY,
-		wouldBlock: wouldBlock(uint32(fileType)),
-		seekable:   seekable,
-		// NOTE(b/38213152): Technically, some obscure char devices can be memory
-		// mapped, but we only allow regular files.
-		canMap: fileType == linux.S_IFREG,
-	}
-	i.pf.inode = i
-
-	// Non-seekable files can't be memory mapped, assert this.
-	if !i.seekable && i.canMap {
-		panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
-	}
-
-	// If the hostFD would block, we must set it to non-blocking and handle
-	// blocking behavior in the sentry.
-	if i.wouldBlock {
-		if err := syscall.SetNonblock(i.hostFD, true); err != nil {
-			return nil, err
-		}
-		if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
-			return nil, err
-		}
-	}
-
 	d := &kernfs.Dentry{}
-	d.Init(i)
+	i, err := newInode(ctx, fs, hostFD, opts.Savable, linux.FileMode(s.Mode).FileType(), opts.IsTTY)
+	if err != nil {
+		return nil, err
+	}
+	d.Init(&fs.Filesystem, i)
 
 	// i.open will take a reference on d.
 	defer d.DecRef(ctx)
@@ -122,29 +204,35 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
 	// For simplicity, fileDescription.offset is set to 0. Technically, we
 	// should only set to 0 on files that are not seekable (sockets, pipes,
 	// etc.), and use the offset from the host fd otherwise when importing.
-	return i.open(ctx, d.VFSDentry(), mnt, flags)
+	return i.open(ctx, d, mnt, flags)
 }
 
 // ImportFD sets up and returns a vfs.FileDescription from a donated fd.
 func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) {
 	return NewFD(ctx, mnt, hostFD, &NewFDOptions{
-		IsTTY: isTTY,
+		Savable: true,
+		IsTTY:   isTTY,
 	})
 }
 
 // filesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type filesystemType struct{}
 
-// GetFilesystem implements FilesystemType.GetFilesystem.
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	panic("host.filesystemType.GetFilesystem should never be called")
 }
 
-// Name implements FilesystemType.Name.
+// Name implements vfs.FilesystemType.Name.
 func (filesystemType) Name() string {
 	return "none"
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (filesystemType) Release(ctx context.Context) {}
+
 // NewFilesystem sets up and returns a new hostfs filesystem.
 //
 // Note that there should only ever be one instance of host.filesystem,
@@ -162,6 +250,8 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
 }
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 
@@ -180,65 +270,7 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 	return vfs.PrependPathSyntheticError{}
 }
 
-// inode implements kernfs.Inode.
-type inode struct {
-	kernfs.InodeNotDirectory
-	kernfs.InodeNotSymlink
-
-	locks vfs.FileLocks
-
-	// When the reference count reaches zero, the host fd is closed.
-	refs.AtomicRefCount
-
-	// hostFD contains the host fd that this file was originally created from,
-	// which must be available at time of restore.
-	//
-	// This field is initialized at creation time and is immutable.
-	hostFD int
-
-	// ino is an inode number unique within this filesystem.
-	//
-	// This field is initialized at creation time and is immutable.
-	ino uint64
-
-	// isTTY is true if this file represents a TTY.
-	//
-	// This field is initialized at creation time and is immutable.
-	isTTY bool
-
-	// seekable is false if the host fd points to a file representing a stream,
-	// e.g. a socket or a pipe. Such files are not seekable and can return
-	// EWOULDBLOCK for I/O operations.
-	//
-	// This field is initialized at creation time and is immutable.
-	seekable bool
-
-	// wouldBlock is true if the host FD would return EWOULDBLOCK for
-	// operations that would block.
-	//
-	// This field is initialized at creation time and is immutable.
-	wouldBlock bool
-
-	// Event queue for blocking operations.
-	queue waiter.Queue
-
-	// canMap specifies whether we allow the file to be memory mapped.
-	//
-	// This field is initialized at creation time and is immutable.
-	canMap bool
-
-	// mapsMu protects mappings.
-	mapsMu sync.Mutex
-
-	// If canMap is true, mappings tracks mappings of hostFD into
-	// memmap.MappingSpaces.
-	mappings memmap.MappingSet
-
-	// pf implements platform.File for mappings of hostFD.
-	pf inodePlatformFile
-}
-
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
 func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
 	var s syscall.Stat_t
 	if err := syscall.Fstat(i.hostFD, &s); err != nil {
@@ -247,7 +279,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid))
 }
 
-// Mode implements kernfs.Inode.
+// Mode implements kernfs.Inode.Mode.
 func (i *inode) Mode() linux.FileMode {
 	var s syscall.Stat_t
 	if err := syscall.Fstat(i.hostFD, &s); err != nil {
@@ -258,7 +290,7 @@ func (i *inode) Mode() linux.FileMode {
 	return linux.FileMode(s.Mode)
 }
 
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
 func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
 	if opts.Mask&linux.STATX__RESERVED != 0 {
 		return linux.Statx{}, syserror.EINVAL
@@ -371,7 +403,7 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) {
 	}, nil
 }
 
-// SetStat implements kernfs.Inode.
+// SetStat implements kernfs.Inode.SetStat.
 func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
 	s := &opts.Stat
 
@@ -430,10 +462,10 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 	return nil
 }
 
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
 func (i *inode) DecRef(ctx context.Context) {
-	i.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
-		if i.wouldBlock {
+	i.inodeRefs.DecRef(func() {
+		if i.mayBlock {
 			fdnotifier.RemoveFD(int32(i.hostFD))
 		}
 		if err := unix.Close(i.hostFD); err != nil {
@@ -442,16 +474,16 @@ func (i *inode) DecRef(ctx context.Context) {
 	})
 }
 
-// Open implements kernfs.Inode.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	// Once created, we cannot re-open a socket fd through /proc/[pid]/fd/.
 	if i.Mode().FileType() == linux.S_IFSOCK {
 		return nil, syserror.ENXIO
 	}
-	return i.open(ctx, vfsd, rp.Mount(), opts.Flags)
+	return i.open(ctx, d, rp.Mount(), opts.Flags)
 }
 
-func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) {
+func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) {
 	var s syscall.Stat_t
 	if err := syscall.Fstat(i.hostFD, &s); err != nil {
 		return nil, err
@@ -475,17 +507,17 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 			return nil, err
 		}
 		// Currently, we only allow Unix sockets to be imported.
-		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks)
+		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d.VFSDentry(), &i.locks)
 
 	case syscall.S_IFREG, syscall.S_IFIFO, syscall.S_IFCHR:
 		if i.isTTY {
 			fd := &TTYFileDescription{
 				fileDescription: fileDescription{inode: i},
-				termios:         linux.DefaultSlaveTermios,
+				termios:         linux.DefaultReplicaTermios,
 			}
 			fd.LockFD.Init(&i.locks)
 			vfsfd := &fd.vfsfd
-			if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+			if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 				return nil, err
 			}
 			return vfsfd, nil
@@ -494,7 +526,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 		fd := &fileDescription{inode: i}
 		fd.LockFD.Init(&i.locks)
 		vfsfd := &fd.vfsfd
-		if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+		if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
 		return vfsfd, nil
@@ -506,6 +538,8 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 }
 
 // fileDescription is embedded by host fd implementations of FileDescriptionImpl.
+//
+// +stateify savable
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -520,41 +554,43 @@ type fileDescription struct {
 	inode *inode
 
 	// offsetMu protects offset.
-	offsetMu sync.Mutex
+	offsetMu sync.Mutex `state:"nosave"`
 
 	// offset specifies the current file offset. It is only meaningful when
 	// inode.seekable is true.
 	offset int64
 }
 
-// SetStat implements vfs.FileDescriptionImpl.
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
 	creds := auth.CredentialsFromContext(ctx)
 	return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts)
 }
 
-// Stat implements vfs.FileDescriptionImpl.
+// Stat implements vfs.FileDescriptionImpl.Stat.
 func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts)
 }
 
-// Release implements vfs.FileDescriptionImpl.
+// Release implements vfs.FileDescriptionImpl.Release.
 func (f *fileDescription) Release(context.Context) {
 	// noop
 }
 
-// Allocate implements vfs.FileDescriptionImpl.
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
 func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
-	if !f.inode.seekable {
-		return syserror.ESPIPE
-	}
-
-	// TODO(gvisor.dev/issue/3589): Implement Allocate for non-pipe hostfds.
-	return syserror.EOPNOTSUPP
+	return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length))
 }
 
-// PRead implements FileDescriptionImpl.
+// PRead implements vfs.FileDescriptionImpl.PRead.
 func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
 	i := f.inode
 	if !i.seekable {
 		return 0, syserror.ESPIPE
@@ -563,21 +599,33 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off
 	return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags)
 }
 
-// Read implements FileDescriptionImpl.
+// Read implements vfs.FileDescriptionImpl.Read.
 func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
 	i := f.inode
 	if !i.seekable {
+		bufN, err := i.readFromBuf(ctx, &dst)
+		if err != nil {
+			return bufN, err
+		}
 		n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags)
+		total := bufN + n
 		if isBlockError(err) {
 			// If we got any data at all, return it as a "completed" partial read
 			// rather than retrying until complete.
-			if n != 0 {
+			if total != 0 {
 				err = nil
 			} else {
 				err = syserror.ErrWouldBlock
 			}
 		}
-		return n, err
+		return total, err
 	}
 
 	f.offsetMu.Lock()
@@ -587,20 +635,33 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts
 	return n, err
 }
 
-func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
-	// Check that flags are supported.
-	//
-	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
-	if flags&^linux.RWF_HIPRI != 0 {
-		return 0, syserror.EOPNOTSUPP
+func (i *inode) readFromBuf(ctx context.Context, dst *usermem.IOSequence) (int64, error) {
+	if atomic.LoadUint32(&i.haveBuf) == 0 {
+		return 0, nil
+	}
+	i.bufMu.Lock()
+	defer i.bufMu.Unlock()
+	if len(i.buf) == 0 {
+		return 0, nil
 	}
+	n, err := dst.CopyOut(ctx, i.buf)
+	*dst = dst.DropFirst(n)
+	i.buf = i.buf[n:]
+	if len(i.buf) == 0 {
+		atomic.StoreUint32(&i.haveBuf, 0)
+		i.buf = nil
+	}
+	return int64(n), err
+}
+
+func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
 	reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
 	n, err := dst.CopyOutFrom(ctx, reader)
 	hostfd.PutReadWriterAt(reader)
 	return int64(n), err
 }
 
-// PWrite implements FileDescriptionImpl.
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
 	if !f.inode.seekable {
 		return 0, syserror.ESPIPE
@@ -609,7 +670,7 @@ func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, of
 	return f.writeToHostFD(ctx, src, offset, opts.Flags)
 }
 
-// Write implements FileDescriptionImpl.
+// Write implements vfs.FileDescriptionImpl.Write.
 func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	i := f.inode
 	if !i.seekable {
@@ -657,7 +718,7 @@ func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSeque
 	return int64(n), err
 }
 
-// Seek implements FileDescriptionImpl.
+// Seek implements vfs.FileDescriptionImpl.Seek.
 //
 // Note that we do not support seeking on directories, since we do not even
 // allow directory fds to be imported at all.
@@ -722,15 +783,17 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i
 	return f.offset, nil
 }
 
-// Sync implements FileDescriptionImpl.
-func (f *fileDescription) Sync(context.Context) error {
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (f *fileDescription) Sync(ctx context.Context) error {
 	// TODO(gvisor.dev/issue/1897): Currently, we always sync everything.
 	return unix.Fsync(f.inode.hostFD)
 }
 
-// ConfigureMMap implements FileDescriptionImpl.
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
 func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
-	if !f.inode.canMap {
+	// NOTE(b/38213152): Technically, some obscure char devices can be memory
+	// mapped, but we only allow regular files.
+	if f.inode.ftype != syscall.S_IFREG {
 		return syserror.ENODEV
 	}
 	i := f.inode
@@ -741,13 +804,17 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts
 // EventRegister implements waiter.Waitable.EventRegister.
 func (f *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
 	f.inode.queue.EventRegister(e, mask)
-	fdnotifier.UpdateFD(int32(f.inode.hostFD))
+	if f.inode.mayBlock {
+		fdnotifier.UpdateFD(int32(f.inode.hostFD))
+	}
 }
 
 // EventUnregister implements waiter.Waitable.EventUnregister.
 func (f *fileDescription) EventUnregister(e *waiter.Entry) {
 	f.inode.queue.EventUnregister(e)
-	fdnotifier.UpdateFD(int32(f.inode.hostFD))
+	if f.inode.mayBlock {
+		fdnotifier.UpdateFD(int32(f.inode.hostFD))
+	}
 }
 
 // Readiness uses the poll() syscall to check the status of the underlying FD.
diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go
index 65d3af38c..3d7eb2f96 100644
--- a/pkg/sentry/fsimpl/host/mmap.go
+++ b/pkg/sentry/fsimpl/host/mmap.go
@@ -27,11 +27,13 @@ import (
 // cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef.
 //
 // inodePlatformFile should only be used if inode.canMap is true.
+//
+// +stateify savable
 type inodePlatformFile struct {
 	*inode
 
 	// fdRefsMu protects fdRefs.
-	fdRefsMu sync.Mutex
+	fdRefsMu sync.Mutex `state:"nosave"`
 
 	// fdRefs counts references on memmap.File offsets. It is used solely for
 	// memory accounting.
@@ -41,7 +43,7 @@ type inodePlatformFile struct {
 	fileMapper fsutil.HostFileMapper
 
 	// fileMapperInitOnce is used to lazily initialize fileMapper.
-	fileMapperInitOnce sync.Once
+	fileMapperInitOnce sync.Once `state:"nosave"`
 }
 
 // IncRef implements memmap.File.IncRef.
diff --git a/pkg/sentry/fsimpl/host/save_restore.go b/pkg/sentry/fsimpl/host/save_restore.go
new file mode 100644
index 000000000..7e32a8863
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/save_restore.go
@@ -0,0 +1,78 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"fmt"
+	"io"
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/hostfd"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// beforeSave is invoked by stateify.
+func (i *inode) beforeSave() {
+	if !i.savable {
+		panic("host.inode is not savable")
+	}
+	if i.ftype == syscall.S_IFIFO {
+		// If this pipe FD is readable, drain it so that bytes in the pipe can
+		// be read after restore. (This is a legacy VFS1 feature.) We don't
+		// know if the pipe FD is readable, so just try reading and tolerate
+		// EBADF from the read.
+		i.bufMu.Lock()
+		defer i.bufMu.Unlock()
+		var buf [usermem.PageSize]byte
+		for {
+			n, err := hostfd.Preadv2(int32(i.hostFD), safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:])), -1 /* offset */, 0 /* flags */)
+			if n != 0 {
+				i.buf = append(i.buf, buf[:n]...)
+			}
+			if err != nil {
+				if err == io.EOF || err == syscall.EAGAIN || err == syscall.EBADF {
+					break
+				}
+				panic(fmt.Errorf("host.inode.beforeSave: buffering from pipe failed: %v", err))
+			}
+		}
+		if len(i.buf) != 0 {
+			atomic.StoreUint32(&i.haveBuf, 1)
+		}
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (i *inode) afterLoad() {
+	if i.mayBlock {
+		if err := syscall.SetNonblock(i.hostFD, true); err != nil {
+			panic(fmt.Sprintf("host.inode.afterLoad: failed to set host FD %d non-blocking: %v", i.hostFD, err))
+		}
+		if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
+			panic(fmt.Sprintf("host.inode.afterLoad: fdnotifier.AddFD(%d) failed: %v", i.hostFD, err))
+		}
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (i *inodePlatformFile) afterLoad() {
+	if i.fileMapper.IsInited() {
+		// Ensure that we don't call i.fileMapper.Init() again.
+		i.fileMapperInitOnce.Do(func() {})
+	}
+}
diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go
index 4979dd0a9..8a447e29f 100644
--- a/pkg/sentry/fsimpl/host/socket.go
+++ b/pkg/sentry/fsimpl/host/socket.go
@@ -22,7 +22,6 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/socket/control"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
@@ -59,8 +58,7 @@ func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transpor
 //
 // +stateify savable
 type ConnectedEndpoint struct {
-	// ref keeps track of references to a ConnectedEndpoint.
-	ref refs.AtomicRefCount
+	ConnectedEndpointRefs
 
 	// mu protects fd below.
 	mu sync.RWMutex `state:"nosave"`
@@ -132,9 +130,9 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable
 		return nil, err
 	}
 
-	// AtomicRefCounters start off with a single reference. We need two.
-	e.ref.IncRef()
-	e.ref.EnableLeakCheck("host.ConnectedEndpoint")
+	// ConnectedEndpointRefs start off with a single reference. We need two.
+	e.IncRef()
+	e.EnableLeakCheck()
 	return &e, nil
 }
 
@@ -318,7 +316,7 @@ func (c *ConnectedEndpoint) destroyLocked() {
 // Release implements transport.ConnectedEndpoint.Release and
 // transport.Receiver.Release.
 func (c *ConnectedEndpoint) Release(ctx context.Context) {
-	c.ref.DecRefWithDestructor(ctx, func(context.Context) {
+	c.DecRef(func() {
 		c.mu.Lock()
 		c.destroyLocked()
 		c.mu.Unlock()
@@ -348,12 +346,12 @@ func (e *SCMConnectedEndpoint) Init() error {
 // Release implements transport.ConnectedEndpoint.Release and
 // transport.Receiver.Release.
 func (e *SCMConnectedEndpoint) Release(ctx context.Context) {
-	e.ref.DecRefWithDestructor(ctx, func(context.Context) {
+	e.DecRef(func() {
 		e.mu.Lock()
+		fdnotifier.RemoveFD(int32(e.fd))
 		if err := syscall.Close(e.fd); err != nil {
 			log.Warningf("Failed to close host fd %d: %v", err)
 		}
-		fdnotifier.RemoveFD(int32(e.fd))
 		e.destroyLocked()
 		e.mu.Unlock()
 	})
@@ -378,8 +376,8 @@ func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr s
 		return nil, err
 	}
 
-	// AtomicRefCounters start off with a single reference. We need two.
-	e.ref.IncRef()
-	e.ref.EnableLeakCheck("host.SCMConnectedEndpoint")
+	// ConnectedEndpointRefs start off with a single reference. We need two.
+	e.IncRef()
+	e.EnableLeakCheck()
 	return &e, nil
 }
diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go
index 35ded24bc..c0bf45f08 100644
--- a/pkg/sentry/fsimpl/host/socket_unsafe.go
+++ b/pkg/sentry/fsimpl/host/socket_unsafe.go
@@ -63,10 +63,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (
 	controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
 
 	if n > length {
-		return length, n, msg.Controllen, controlTrunc, err
+		return length, n, msg.Controllen, controlTrunc, nil
 	}
 
-	return n, n, msg.Controllen, controlTrunc, err
+	return n, n, msg.Controllen, controlTrunc, nil
 }
 
 // fdWriteVec sends from bufs to fd.
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index 27cbd3059..f5c596fec 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -17,6 +17,7 @@ package host
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -29,6 +30,8 @@ import (
 
 // TTYFileDescription implements vfs.FileDescriptionImpl for a host file
 // descriptor that wraps a TTY FD.
+//
+// +stateify savable
 type TTYFileDescription struct {
 	fileDescription
 
@@ -75,7 +78,7 @@ func (t *TTYFileDescription) Release(ctx context.Context) {
 	t.fileDescription.Release(ctx)
 }
 
-// PRead implements vfs.FileDescriptionImpl.
+// PRead implements vfs.FileDescriptionImpl.PRead.
 //
 // Reading from a TTY is only allowed for foreground process groups. Background
 // process groups will either get EIO or a SIGTTIN.
@@ -93,7 +96,7 @@ func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence,
 	return t.fileDescription.PRead(ctx, dst, offset, opts)
 }
 
-// Read implements vfs.FileDescriptionImpl.
+// Read implements vfs.FileDescriptionImpl.Read.
 //
 // Reading from a TTY is only allowed for foreground process groups. Background
 // process groups will either get EIO or a SIGTTIN.
@@ -111,7 +114,7 @@ func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, o
 	return t.fileDescription.Read(ctx, dst, opts)
 }
 
-// PWrite implements vfs.FileDescriptionImpl.
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
@@ -126,7 +129,7 @@ func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence,
 	return t.fileDescription.PWrite(ctx, src, offset, opts)
 }
 
-// Write implements vfs.FileDescriptionImpl.
+// Write implements vfs.FileDescriptionImpl.Write.
 func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
@@ -141,8 +144,13 @@ func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence,
 	return t.fileDescription.Write(ctx, src, opts)
 }
 
-// Ioctl implements vfs.FileDescriptionImpl.
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
 func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		return 0, syserror.ENOTTY
+	}
+
 	// Ignore arg[0]. This is the real FD:
 	fd := t.inode.hostFD
 	ioctl := args[1].Uint64()
@@ -152,9 +160,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		if err != nil {
 			return 0, err
 		}
-		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err = termios.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
@@ -166,9 +172,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		}
 
 		var termios linux.Termios
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
 		err := ioctlSetTermios(fd, ioctl, &termios)
@@ -192,10 +196,8 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		defer t.mu.Unlock()
 
 		// Map the ProcessGroup into a ProcessGroupID in the task's PID namespace.
-		pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup))
+		_, err := pgID.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TIOCSPGRP:
@@ -203,11 +205,6 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		// Equivalent to tcsetpgrp(fd, *argp).
 		// Set the foreground process group ID of this terminal.
 
-		task := kernel.TaskFromContext(ctx)
-		if task == nil {
-			return 0, syserror.ENOTTY
-		}
-
 		t.mu.Lock()
 		defer t.mu.Unlock()
 
@@ -226,12 +223,11 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 			return 0, syserror.ENOTTY
 		}
 
-		var pgID kernel.ProcessGroupID
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		var pgIDP primitive.Int32
+		if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
+		pgID := kernel.ProcessGroupID(pgIDP)
 
 		// pgID must be non-negative.
 		if pgID < 0 {
@@ -260,9 +256,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		if err != nil {
 			return 0, err
 		}
-		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err = winsize.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TIOCSWINSZ:
@@ -273,9 +267,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		// set the winsize.
 
 		var winsize linux.Winsize
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
 		err := ioctlSetWinsize(fd, &winsize)
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
index 412bdb2eb..b2f43a119 100644
--- a/pkg/sentry/fsimpl/host/util.go
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -43,12 +43,6 @@ func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp {
 	return linux.StatxTimestamp{Sec: int64(ts.Sec), Nsec: uint32(ts.Nsec)}
 }
 
-// wouldBlock returns true for file types that can return EWOULDBLOCK
-// for blocking operations, e.g. pipes, character devices, and sockets.
-func wouldBlock(fileType uint32) bool {
-	return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
-}
-
 // isBlockError checks if an error is EAGAIN or EWOULDBLOCK.
 // If so, they can be transformed into syserror.ErrWouldBlock.
 func isBlockError(err error) bool {
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 3835557fe..aaad67ab8 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -4,6 +4,18 @@ load("//tools/go_generics:defs.bzl", "go_template_instance")
 licenses(["notice"])
 
 go_template_instance(
+    name = "dentry_list",
+    out = "dentry_list.go",
+    package = "kernfs",
+    prefix = "dentry",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*Dentry",
+        "Linker": "*Dentry",
+    },
+)
+
+go_template_instance(
     name = "fstree",
     out = "fstree.go",
     package = "kernfs",
@@ -26,9 +38,54 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "static_directory_refs",
+    out = "static_directory_refs.go",
+    package = "kernfs",
+    prefix = "StaticDirectory",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "StaticDirectory",
+    },
+)
+
+go_template_instance(
+    name = "dir_refs",
+    out = "dir_refs.go",
+    package = "kernfs_test",
+    prefix = "dir",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "dir",
+    },
+)
+
+go_template_instance(
+    name = "readonly_dir_refs",
+    out = "readonly_dir_refs.go",
+    package = "kernfs_test",
+    prefix = "readonlyDir",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "readonlyDir",
+    },
+)
+
+go_template_instance(
+    name = "synthetic_directory_refs",
+    out = "synthetic_directory_refs.go",
+    package = "kernfs",
+    prefix = "syntheticDirectory",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "syntheticDirectory",
+    },
+)
+
 go_library(
     name = "kernfs",
     srcs = [
+        "dentry_list.go",
         "dynamic_bytes_file.go",
         "fd_impl_util.go",
         "filesystem.go",
@@ -36,7 +93,10 @@ go_library(
         "inode_impl_util.go",
         "kernfs.go",
         "slot_list.go",
+        "static_directory_refs.go",
         "symlink.go",
+        "synthetic_directory.go",
+        "synthetic_directory_refs.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
@@ -45,8 +105,11 @@ go_library(
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/refsvfs2",
+        "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/vfs",
@@ -59,11 +122,18 @@ go_library(
 go_test(
     name = "kernfs_test",
     size = "small",
-    srcs = ["kernfs_test.go"],
+    srcs = [
+        "dir_refs.go",
+        "kernfs_test.go",
+        "readonly_dir_refs.go",
+    ],
     deps = [
         ":kernfs",
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/testutil",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 12adf727a..485504995 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -35,6 +35,7 @@ import (
 // +stateify savable
 type DynamicBytesFile struct {
 	InodeAttrs
+	InodeNoStatFS
 	InodeNoopRefCount
 	InodeNotDirectory
 	InodeNotSymlink
@@ -46,18 +47,18 @@ type DynamicBytesFile struct {
 var _ Inode = (*DynamicBytesFile)(nil)
 
 // Init initializes a dynamic bytes file.
-func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) {
+func (f *DynamicBytesFile) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) {
 	if perm&^linux.PermissionsMask != 0 {
 		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
 	}
-	f.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
+	f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
 	f.data = data
 }
 
 // Open implements Inode.Open.
-func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &DynamicBytesFD{}
-	if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil {
+	if err := fd.Init(rp.Mount(), d, f.data, &f.locks, opts.Flags); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
@@ -86,12 +87,12 @@ type DynamicBytesFD struct {
 }
 
 // Init initializes a DynamicBytesFD.
-func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
 	fd.LockFD.Init(locks)
-	if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 		return err
 	}
-	fd.inode = d.Impl().(*Dentry).inode
+	fd.inode = d.inode
 	fd.SetDataSource(data)
 	return nil
 }
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index fcee6200a..f8dae22f8 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -15,7 +15,7 @@
 package kernfs
 
 import (
-	"math"
+	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -28,9 +28,29 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// SeekEndConfig describes the SEEK_END behaviour for FDs.
+//
+// +stateify savable
+type SeekEndConfig int
+
+// Constants related to SEEK_END behaviour for FDs.
+const (
+	// Consider the end of the file to be after the final static entry. This is
+	// the default option.
+	SeekEndStaticEntries = iota
+	// Consider the end of the file to be at offset 0.
+	SeekEndZero
+)
+
+// GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD.
+//
+// +stateify savable
+type GenericDirectoryFDOptions struct {
+	SeekEnd SeekEndConfig
+}
+
 // GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory
-// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not
-// compatible with dynamic directories.
+// inode that uses OrderChildren to track child nodes.
 //
 // Note that GenericDirectoryFD holds a lock over OrderedChildren while calling
 // IterDirents callback. The IterDirents callback therefore cannot hash or
@@ -40,16 +60,21 @@ import (
 // Must be initialize with Init before first use.
 //
 // Lock ordering: mu => children.mu.
+//
+// +stateify savable
 type GenericDirectoryFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.DirectoryFileDescriptionDefaultImpl
 	vfs.LockFD
 
+	// Immutable.
+	seekEnd SeekEndConfig
+
 	vfsfd    vfs.FileDescription
 	children *OrderedChildren
 
 	// mu protects the fields below.
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
 	// off is the current directory offset. Protected by "mu".
 	off int64
@@ -57,12 +82,12 @@ type GenericDirectoryFD struct {
 
 // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
 // dentry.
-func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
+func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) {
 	fd := &GenericDirectoryFD{}
-	if err := fd.Init(children, locks, opts); err != nil {
+	if err := fd.Init(children, locks, opts, fdOpts); err != nil {
 		return nil, err
 	}
-	if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := fd.vfsfd.Init(fd, opts.Flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
 	return fd, nil
@@ -71,12 +96,13 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre
 // Init initializes a GenericDirectoryFD. Use it when overriding
 // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the
 // correct implementation.
-func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error {
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error {
 	if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
 		// Can't open directories for writing.
 		return syserror.EISDIR
 	}
 	fd.LockFD.Init(locks)
+	fd.seekEnd = fdOpts.SeekEnd
 	fd.children = children
 	return nil
 }
@@ -119,8 +145,12 @@ func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem {
 	return fd.vfsfd.VirtualDentry().Mount().Filesystem()
 }
 
+func (fd *GenericDirectoryFD) dentry() *Dentry {
+	return fd.vfsfd.Dentry().Impl().(*Dentry)
+}
+
 func (fd *GenericDirectoryFD) inode() Inode {
-	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+	return fd.dentry().inode
 }
 
 // IterDirents implements vfs.FileDescriptionImpl.IterDirents. IterDirents holds
@@ -150,8 +180,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 
 	// Handle "..".
 	if fd.off == 1 {
-		vfsd := fd.vfsfd.VirtualDentry().Dentry()
-		parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode
+		parentInode := genericParentOrSelf(fd.dentry()).inode
 		stat, err := parentInode.Stat(ctx, fd.filesystem(), opts)
 		if err != nil {
 			return err
@@ -175,13 +204,12 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	// these.
 	childIdx := fd.off - 2
 	for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
-		inode := it.Dentry.Impl().(*Dentry).inode
-		stat, err := inode.Stat(ctx, fd.filesystem(), opts)
+		stat, err := it.inode.Stat(ctx, fd.filesystem(), opts)
 		if err != nil {
 			return err
 		}
 		dirent := vfs.Dirent{
-			Name:    it.Name,
+			Name:    it.name,
 			Type:    linux.FileMode(stat.Mode).DirentType(),
 			Ino:     stat.Ino,
 			NextOff: fd.off + 1,
@@ -194,7 +222,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 
 	var err error
 	relOffset := fd.off - int64(len(fd.children.set)) - 2
-	fd.off, err = fd.inode().IterDirents(ctx, cb, fd.off, relOffset)
+	fd.off, err = fd.inode().IterDirents(ctx, fd.vfsfd.Mount(), cb, fd.off, relOffset)
 	return err
 }
 
@@ -209,9 +237,17 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
 	case linux.SEEK_CUR:
 		offset += fd.off
 	case linux.SEEK_END:
-		// TODO(gvisor.dev/issue/1193): This can prevent new files from showing up
-		// if they are added after SEEK_END.
-		offset = math.MaxInt64
+		switch fd.seekEnd {
+		case SeekEndStaticEntries:
+			fd.children.mu.RLock()
+			offset += int64(len(fd.children.set))
+			offset += 2 // '.' and '..' aren't tracked in children.
+			fd.children.mu.RUnlock()
+		case SeekEndZero:
+			// No-op: offset += 0.
+		default:
+			panic(fmt.Sprintf("Invalid GenericDirectoryFD.seekEnd = %v", fd.seekEnd))
+		}
 	default:
 		return 0, syserror.EINVAL
 	}
@@ -232,8 +268,7 @@ func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (l
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
 	creds := auth.CredentialsFromContext(ctx)
-	inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
-	return inode.SetStat(ctx, fd.filesystem(), creds, opts)
+	return fd.inode().SetStat(ctx, fd.filesystem(), creds, opts)
 }
 
 // Allocate implements vfs.FileDescriptionImpl.Allocate.
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 3e5192edd..399895f3e 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -32,11 +32,12 @@ import (
 //
 // stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
 //
-// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * !rp.Done().
 //
 // Postcondition: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) {
-	d := vfsd.Impl().(*Dentry)
+func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, mayFollowSymlinks bool) (*Dentry, error) {
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
 	}
@@ -53,20 +54,20 @@ afterSymlink:
 	// calls d_revalidate(), but walk_component() => handle_dots() does not.
 	if name == "." {
 		rp.Advance()
-		return vfsd, nil
+		return d, nil
 	}
 	if name == ".." {
-		if isRoot, err := rp.CheckRoot(ctx, vfsd); err != nil {
+		if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil {
 			return nil, err
 		} else if isRoot || d.parent == nil {
 			rp.Advance()
-			return vfsd, nil
+			return d, nil
 		}
-		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+		if err := rp.CheckMount(ctx, d.parent.VFSDentry()); err != nil {
 			return nil, err
 		}
 		rp.Advance()
-		return &d.parent.vfsd, nil
+		return d.parent, nil
 	}
 	if len(name) > linux.NAME_MAX {
 		return nil, syserror.ENAMETOOLONG
@@ -77,7 +78,7 @@ afterSymlink:
 	if err != nil {
 		return nil, err
 	}
-	if err := rp.CheckMount(ctx, &next.vfsd); err != nil {
+	if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil {
 		return nil, err
 	}
 	// Resolve any symlink at current path component.
@@ -88,7 +89,7 @@ afterSymlink:
 		}
 		if targetVD.Ok() {
 			err := rp.HandleJump(targetVD)
-			targetVD.DecRef(ctx)
+			fs.deferDecRefVD(ctx, targetVD)
 			if err != nil {
 				return nil, err
 			}
@@ -100,15 +101,18 @@ afterSymlink:
 		goto afterSymlink
 	}
 	rp.Advance()
-	return &next.vfsd, nil
+	return next, nil
 }
 
 // revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
 // or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
 // nil) to verify that the returned child (or lack thereof) is correct.
 //
-// Preconditions: Filesystem.mu must be locked for at least reading.
-// parent.dirMu must be locked. parent.isDir(). name is not "." or "..".
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * parent.dirMu must be locked.
+// * parent.isDir().
+// * name is not "." or "..".
 //
 // Postconditions: Caller must call fs.processDeferredDecRefs*.
 func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) {
@@ -116,26 +120,33 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 		// Cached dentry exists, revalidate.
 		if !child.inode.Valid(ctx) {
 			delete(parent.children, name)
-			vfsObj.InvalidateDentry(ctx, &child.vfsd)
-			fs.deferDecRef(&child.vfsd) // Reference from Lookup.
+			if child.inode.Keep() {
+				// Drop the ref owned by kernfs.
+				fs.deferDecRef(child)
+			}
+			vfsObj.InvalidateDentry(ctx, child.VFSDentry())
 			child = nil
 		}
 	}
 	if child == nil {
-		// Dentry isn't cached; it either doesn't exist or failed
-		// revalidation. Attempt to resolve it via Lookup.
-		//
-		// FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return
-		// *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes
-		// that all dentries in the filesystem are (kernfs.)Dentry and performs
-		// vfs.DentryImpl casts accordingly.
-		childVFSD, err := parent.inode.Lookup(ctx, name)
+		// Dentry isn't cached; it either doesn't exist or failed revalidation.
+		// Attempt to resolve it via Lookup.
+		childInode, err := parent.inode.Lookup(ctx, name)
 		if err != nil {
 			return nil, err
 		}
-		// Reference on childVFSD dropped by a corresponding Valid.
-		child = childVFSD.Impl().(*Dentry)
-		parent.insertChildLocked(name, child)
+		var newChild Dentry
+		newChild.Init(fs, childInode) // childInode's ref is transferred to newChild.
+		parent.insertChildLocked(name, &newChild)
+		child = &newChild
+
+		// Drop the ref on newChild. This will cause the dentry to get pruned
+		// from the dentry tree by the end of current filesystem operation
+		// (before returning to the VFS layer) if another ref is not picked on
+		// this dentry.
+		if !childInode.Keep() {
+			fs.deferDecRef(&newChild)
+		}
 	}
 	return child, nil
 }
@@ -148,20 +159,19 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 // Preconditions: Filesystem.mu must be locked for at least reading.
 //
 // Postconditions: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
-	vfsd := rp.Start()
+func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
+	d := rp.Start().Impl().(*Dentry)
 	for !rp.Done() {
 		var err error
-		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
+		d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */)
 		if err != nil {
-			return nil, nil, err
+			return nil, err
 		}
 	}
-	d := vfsd.Impl().(*Dentry)
 	if rp.MustBeDir() && !d.isDir() {
-		return nil, nil, syserror.ENOTDIR
+		return nil, syserror.ENOTDIR
 	}
-	return vfsd, d.inode, nil
+	return d, nil
 }
 
 // walkParentDirLocked resolves all but the last path component of rp to an
@@ -171,32 +181,34 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP
 // walkParentDirLocked is loosely analogous to Linux's
 // fs/namei.c:path_parentat().
 //
-// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * !rp.Done().
 //
 // Postconditions: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
-	vfsd := rp.Start()
+func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
+	d := rp.Start().Impl().(*Dentry)
 	for !rp.Final() {
 		var err error
-		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
+		d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */)
 		if err != nil {
-			return nil, nil, err
+			return nil, err
 		}
 	}
-	d := vfsd.Impl().(*Dentry)
 	if !d.isDir() {
-		return nil, nil, syserror.ENOTDIR
+		return nil, syserror.ENOTDIR
 	}
-	return vfsd, d.inode, nil
+	return d, nil
 }
 
 // checkCreateLocked checks that a file named rp.Component() may be created in
-// directory parentVFSD, then returns rp.Component().
+// directory parent, then returns rp.Component().
 //
-// Preconditions: Filesystem.mu must be locked for at least reading. parentInode
-// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true.
-func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
-	if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * isDir(parentInode) == true.
+func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *Dentry) (string, error) {
+	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return "", err
 	}
 	pc := rp.Component()
@@ -206,11 +218,10 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
 	if len(pc) > linux.NAME_MAX {
 		return "", syserror.ENAMETOOLONG
 	}
-	// FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu.
-	if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok {
+	if _, ok := parent.children[pc]; ok {
 		return "", syserror.EEXIST
 	}
-	if parentVFSD.IsDead() {
+	if parent.VFSDentry().IsDead() {
 		return "", syserror.ENOENT
 	}
 	return pc, nil
@@ -219,8 +230,8 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
 // checkDeleteLocked checks that the file represented by vfsd may be deleted.
 //
 // Preconditions: Filesystem.mu must be locked for at least reading.
-func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
-	parent := vfsd.Impl().(*Dentry).parent
+func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error {
+	parent := d.parent
 	if parent == nil {
 		return syserror.EBUSY
 	}
@@ -249,11 +260,11 @@ func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
 	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.RUnlock()
 
-	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
-	return inode.CheckPermissions(ctx, creds, ats)
+	return d.inode.CheckPermissions(ctx, creds, ats)
 }
 
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
@@ -261,20 +272,20 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 	fs.mu.RLock()
 	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.RUnlock()
-	vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return nil, err
 	}
 
 	if opts.CheckSearchable {
-		d := vfsd.Impl().(*Dentry)
 		if !d.isDir() {
 			return nil, syserror.ENOTDIR
 		}
-		if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+		if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
 			return nil, err
 		}
 	}
+	vfsd := d.VFSDentry()
 	vfsd.IncRef() // Ownership transferred to caller.
 	return vfsd, nil
 }
@@ -284,12 +295,12 @@ func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
 	fs.mu.RLock()
 	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.RUnlock()
-	vfsd, _, err := fs.walkParentDirLocked(ctx, rp)
+	d, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return nil, err
 	}
-	vfsd.IncRef() // Ownership transferred to caller.
-	return vfsd, nil
+	d.IncRef() // Ownership transferred to caller.
+	return d.VFSDentry(), nil
 }
 
 // LinkAt implements vfs.FilesystemImpl.LinkAt.
@@ -298,13 +309,16 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 		return syserror.EEXIST
 	}
 	fs.mu.Lock()
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+	parent, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
-	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+	pc, err := checkCreateLocked(ctx, rp, parent)
 	if err != nil {
 		return err
 	}
@@ -321,11 +335,13 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 		return syserror.EPERM
 	}
 
-	childVFSD, err := parentInode.NewLink(ctx, pc, d.inode)
+	childI, err := parent.inode.NewLink(ctx, pc, d.inode)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+	var child Dentry
+	child.Init(fs, childI)
+	parent.insertChildLocked(pc, &child)
 	return nil
 }
 
@@ -335,13 +351,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return syserror.EEXIST
 	}
 	fs.mu.Lock()
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+	parent, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
-	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+	pc, err := checkCreateLocked(ctx, rp, parent)
 	if err != nil {
 		return err
 	}
@@ -349,11 +368,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	childVFSD, err := parentInode.NewDir(ctx, pc, opts)
+	childI, err := parent.inode.NewDir(ctx, pc, opts)
 	if err != nil {
-		return err
+		if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
+			return err
+		}
+		childI = newSyntheticDirectory(ctx, rp.Credentials(), opts.Mode)
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+	var child Dentry
+	child.Init(fs, childI)
+	parent.insertChildLocked(pc, &child)
 	return nil
 }
 
@@ -363,13 +387,16 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return syserror.EEXIST
 	}
 	fs.mu.Lock()
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+	parent, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
-	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+	pc, err := checkCreateLocked(ctx, rp, parent)
 	if err != nil {
 		return err
 	}
@@ -377,11 +404,13 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	newVFSD, err := parentInode.NewNode(ctx, pc, opts)
+	newI, err := parent.inode.NewNode(ctx, pc, opts)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry))
+	var newD Dentry
+	newD.Init(fs, newI)
+	parent.insertChildLocked(pc, &newD)
 	return nil
 }
 
@@ -397,28 +426,28 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	// Do not create new file.
 	if opts.Flags&linux.O_CREAT == 0 {
 		fs.mu.RLock()
-		vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+		defer fs.processDeferredDecRefs(ctx)
+		d, err := fs.walkExistingLocked(ctx, rp)
 		if err != nil {
 			fs.mu.RUnlock()
-			fs.processDeferredDecRefs(ctx)
 			return nil, err
 		}
-		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+		if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			fs.mu.RUnlock()
-			fs.processDeferredDecRefs(ctx)
 			return nil, err
 		}
-		inode.IncRef()
-		defer inode.DecRef(ctx)
+		// Open may block so we need to unlock fs.mu. IncRef d to prevent
+		// its destruction while fs.mu is unlocked.
+		d.IncRef()
 		fs.mu.RUnlock()
-		fs.processDeferredDecRefs(ctx)
-		return inode.Open(ctx, rp, vfsd, opts)
+		fd, err := d.inode.Open(ctx, rp, d, opts)
+		d.DecRef(ctx)
+		return fd, err
 	}
 
 	// May create new file.
 	mustCreate := opts.Flags&linux.O_EXCL != 0
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*Dentry).inode
+	d := rp.Start().Impl().(*Dentry)
 	fs.mu.Lock()
 	unlocked := false
 	unlock := func() {
@@ -427,6 +456,10 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 			unlocked = true
 		}
 	}
+	// Process all to-be-decref'd dentries at the end at once.
+	// Since we defer unlock() AFTER this, fs.mu is guaranteed to be unlocked
+	// when this is executed.
+	defer fs.processDeferredDecRefs(ctx)
 	defer unlock()
 	if rp.Done() {
 		if rp.MustBeDir() {
@@ -435,22 +468,24 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if mustCreate {
 			return nil, syserror.EEXIST
 		}
-		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+		if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
-		inode.IncRef()
-		defer inode.DecRef(ctx)
+		// Open may block so we need to unlock fs.mu. IncRef d to prevent
+		// its destruction while fs.mu is unlocked.
+		d.IncRef()
 		unlock()
-		return inode.Open(ctx, rp, vfsd, opts)
+		fd, err := d.inode.Open(ctx, rp, d, opts)
+		d.DecRef(ctx)
+		return fd, err
 	}
 afterTrailingSymlink:
-	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+	parent, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return nil, err
 	}
 	// Check for search permission in the parent directory.
-	if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 	// Reject attempts to open directories with O_CREAT.
@@ -465,10 +500,10 @@ afterTrailingSymlink:
 		return nil, syserror.ENAMETOOLONG
 	}
 	// Determine whether or not we need to create a file.
-	childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */)
+	child, err := fs.stepExistingLocked(ctx, rp, parent, false /* mayFollowSymlinks */)
 	if err == syserror.ENOENT {
 		// Already checked for searchability above; now check for writability.
-		if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
+		if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
 			return nil, err
 		}
 		if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -476,16 +511,20 @@ afterTrailingSymlink:
 		}
 		defer rp.Mount().EndWrite()
 		// Create and open the child.
-		childVFSD, err = parentInode.NewFile(ctx, pc, opts)
+		childI, err := parent.inode.NewFile(ctx, pc, opts)
 		if err != nil {
 			return nil, err
 		}
-		child := childVFSD.Impl().(*Dentry)
-		parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
-		child.inode.IncRef()
-		defer child.inode.DecRef(ctx)
+		var child Dentry
+		child.Init(fs, childI)
+		parent.insertChild(pc, &child)
+		// Open may block so we need to unlock fs.mu. IncRef child to prevent
+		// its destruction while fs.mu is unlocked.
+		child.IncRef()
 		unlock()
-		return child.inode.Open(ctx, rp, childVFSD, opts)
+		fd, err := child.inode.Open(ctx, rp, &child, opts)
+		child.DecRef(ctx)
+		return fd, err
 	}
 	if err != nil {
 		return nil, err
@@ -494,7 +533,6 @@ afterTrailingSymlink:
 	if mustCreate {
 		return nil, syserror.EEXIST
 	}
-	child := childVFSD.Impl().(*Dentry)
 	if rp.ShouldFollowSymlink() && child.isSymlink() {
 		targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount())
 		if err != nil {
@@ -502,7 +540,7 @@ afterTrailingSymlink:
 		}
 		if targetVD.Ok() {
 			err := rp.HandleJump(targetVD)
-			targetVD.DecRef(ctx)
+			fs.deferDecRefVD(ctx, targetVD)
 			if err != nil {
 				return nil, err
 			}
@@ -518,25 +556,28 @@ afterTrailingSymlink:
 	if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
-	child.inode.IncRef()
-	defer child.inode.DecRef(ctx)
+	// Open may block so we need to unlock fs.mu. IncRef child to prevent
+	// its destruction while fs.mu is unlocked.
+	child.IncRef()
 	unlock()
-	return child.inode.Open(ctx, rp, &child.vfsd, opts)
+	fd, err := child.inode.Open(ctx, rp, child, opts)
+	child.DecRef(ctx)
+	return fd, err
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
 func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
 	fs.mu.RLock()
-	d, inode, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return "", err
 	}
-	if !d.Impl().(*Dentry).isSymlink() {
+	if !d.isSymlink() {
 		return "", syserror.EINVAL
 	}
-	return inode.Readlink(ctx)
+	return d.inode.Readlink(ctx, rp.Mount())
 }
 
 // RenameAt implements vfs.FilesystemImpl.RenameAt.
@@ -548,16 +589,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
 
 	fs.mu.Lock()
-	defer fs.processDeferredDecRefsLocked(ctx)
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
 
 	// Resolve the destination directory first to verify that it's on this
 	// Mount.
-	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+	dstDir, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
-	dstDir := dstDirVFSD.Impl().(*Dentry)
 	mnt := rp.Mount()
 	if mnt != oldParentVD.Mount() {
 		return syserror.EXDEV
@@ -575,16 +615,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if err != nil {
 		return err
 	}
-	srcVFSD := &src.vfsd
 
 	// Can we remove the src dentry?
-	if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil {
+	if err := checkDeleteLocked(ctx, rp, src); err != nil {
 		return err
 	}
 
 	// Can we create the dst dentry?
 	var dst *Dentry
-	pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode)
+	pc, err := checkCreateLocked(ctx, rp, dstDir)
 	switch err {
 	case nil:
 		// Ok, continue with rename as replacement.
@@ -595,14 +634,14 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		}
 		dst = dstDir.children[pc]
 		if dst == nil {
-			panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD))
+			panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDir))
 		}
 	default:
 		return err
 	}
 	var dstVFSD *vfs.Dentry
 	if dst != nil {
-		dstVFSD = &dst.vfsd
+		dstVFSD = dst.VFSDentry()
 	}
 
 	mntns := vfs.MountNamespaceFromContext(ctx)
@@ -618,35 +657,44 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		defer dstDir.dirMu.Unlock()
 	}
 
+	srcVFSD := src.VFSDentry()
 	if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
 		return err
 	}
-	replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD)
+	err = srcDir.inode.Rename(ctx, src.name, pc, src.inode, dstDir.inode)
 	if err != nil {
 		virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
 		return err
 	}
 	delete(srcDir.children, src.name)
 	if srcDir != dstDir {
-		fs.deferDecRef(srcDirVFSD)
-		dstDir.IncRef()
+		fs.deferDecRef(srcDir) // child (src) drops ref on old parent.
+		dstDir.IncRef()        // child (src) takes a ref on the new parent.
 	}
 	src.parent = dstDir
 	src.name = pc
 	if dstDir.children == nil {
 		dstDir.children = make(map[string]*Dentry)
 	}
+	replaced := dstDir.children[pc]
 	dstDir.children[pc] = src
-	virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaced)
+	var replaceVFSD *vfs.Dentry
+	if replaced != nil {
+		// deferDecRef so that fs.mu and dstDir.mu are unlocked by then.
+		fs.deferDecRef(replaced)
+		replaceVFSD = replaced.VFSDentry()
+	}
+	virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD)
 	return nil
 }
 
 // RmdirAt implements vfs.FilesystemImpl.RmdirAt.
 func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
-	vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
@@ -654,14 +702,13 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
+	if err := checkDeleteLocked(ctx, rp, d); err != nil {
 		return err
 	}
-	d := vfsd.Impl().(*Dentry)
 	if !d.isDir() {
 		return syserror.ENOTDIR
 	}
-	if inode.HasChildren() {
+	if d.inode.HasChildren() {
 		return syserror.ENOTEMPTY
 	}
 	virtfs := rp.VirtualFilesystem()
@@ -671,13 +718,18 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 
 	mntns := vfs.MountNamespaceFromContext(ctx)
 	defer mntns.DecRef(ctx)
+	vfsd := d.VFSDentry()
 	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
 		return err
 	}
-	if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
+
+	if err := parentDentry.inode.RmDir(ctx, d.name, d.inode); err != nil {
 		virtfs.AbortDeleteDentry(vfsd)
 		return err
 	}
+	delete(parentDentry.children, d.name)
+	// Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then.
+	fs.deferDecRef(d)
 	virtfs.CommitDeleteDentry(ctx, vfsd)
 	return nil
 }
@@ -685,41 +737,40 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
 func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	fs.mu.RLock()
-	_, inode, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
 	if opts.Stat.Mask == 0 {
 		return nil
 	}
-	return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
+	return d.inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
 func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
 	fs.mu.RLock()
-	_, inode, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return linux.Statx{}, err
 	}
-	return inode.Stat(ctx, fs.VFSFilesystem(), opts)
+	return d.inode.Stat(ctx, fs.VFSFilesystem(), opts)
 }
 
 // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
 func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
 	fs.mu.RLock()
-	_, _, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return linux.Statfs{}, err
 	}
-	// TODO(gvisor.dev/issue/1193): actually implement statfs.
-	return linux.Statfs{}, syserror.ENOSYS
+	return d.inode.StatFS(ctx, fs.VFSFilesystem())
 }
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
@@ -728,13 +779,16 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 		return syserror.EEXIST
 	}
 	fs.mu.Lock()
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+	parent, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
-	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	pc, err := checkCreateLocked(ctx, rp, parent)
 	if err != nil {
 		return err
 	}
@@ -742,20 +796,23 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	childVFSD, err := parentInode.NewSymlink(ctx, pc, target)
+	childI, err := parent.inode.NewSymlink(ctx, pc, target)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+	var child Dentry
+	child.Init(fs, childI)
+	parent.insertChildLocked(pc, &child)
 	return nil
 }
 
 // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
 func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
-	vfsd, _, err := fs.walkExistingLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
@@ -763,10 +820,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
+	if err := checkDeleteLocked(ctx, rp, d); err != nil {
 		return err
 	}
-	d := vfsd.Impl().(*Dentry)
 	if d.isDir() {
 		return syserror.EISDIR
 	}
@@ -776,38 +832,42 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	defer parentDentry.dirMu.Unlock()
 	mntns := vfs.MountNamespaceFromContext(ctx)
 	defer mntns.DecRef(ctx)
+	vfsd := d.VFSDentry()
 	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
 		return err
 	}
-	if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
+	if err := parentDentry.inode.Unlink(ctx, d.name, d.inode); err != nil {
 		virtfs.AbortDeleteDentry(vfsd)
 		return err
 	}
+	delete(parentDentry.children, d.name)
+	// Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then.
+	fs.deferDecRef(d)
 	virtfs.CommitDeleteDentry(ctx, vfsd)
 	return nil
 }
 
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
 func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
 	fs.mu.RLock()
-	_, inode, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return nil, err
 	}
-	if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
+	if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
 		return nil, err
 	}
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	fs.mu.RLock()
-	_, _, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	_, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return nil, err
 	}
@@ -815,12 +875,12 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
 	return nil, syserror.ENOTSUP
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	fs.mu.RLock()
-	_, _, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	_, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return "", err
 	}
@@ -828,12 +888,12 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return "", syserror.ENOTSUP
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	fs.mu.RLock()
-	_, _, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	_, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
@@ -841,12 +901,12 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return syserror.ENOTSUP
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	fs.mu.RLock()
-	_, _, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	_, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
@@ -860,3 +920,16 @@ func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 	defer fs.mu.RUnlock()
 	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b)
 }
+
+func (fs *Filesystem) deferDecRefVD(ctx context.Context, vd vfs.VirtualDentry) {
+	if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs {
+		// The following is equivalent to vd.DecRef(ctx). This is needed
+		// because if d belongs to this filesystem, we can not DecRef it right
+		// away as we may be holding fs.mu. d.DecRef may acquire fs.mu. So we
+		// defer the DecRef to when locks are dropped.
+		vd.Mount().DecRef(ctx)
+		fs.deferDecRef(d)
+	} else {
+		vd.DecRef(ctx)
+	}
+}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index fe8a1e710..d9d76758a 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -20,11 +20,12 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // InodeNoopRefCount partially implements the Inode interface, specifically the
@@ -32,7 +33,10 @@ import (
 // count for inodes, performing no extra actions when references are obtained or
 // released. This is suitable for simple file inodes that don't reference any
 // resources.
+//
+// +stateify savable
 type InodeNoopRefCount struct {
+	InodeTemporary
 }
 
 // IncRef implements Inode.IncRef.
@@ -51,30 +55,32 @@ func (InodeNoopRefCount) TryIncRef() bool {
 // InodeDirectoryNoNewChildren partially implements the Inode interface.
 // InodeDirectoryNoNewChildren represents a directory inode which does not
 // support creation of new children.
+//
+// +stateify savable
 type InodeDirectoryNoNewChildren struct{}
 
 // NewFile implements Inode.NewFile.
-func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) {
 	return nil, syserror.EPERM
 }
 
 // NewDir implements Inode.NewDir.
-func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) {
 	return nil, syserror.EPERM
 }
 
 // NewLink implements Inode.NewLink.
-func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (Inode, error) {
 	return nil, syserror.EPERM
 }
 
 // NewSymlink implements Inode.NewSymlink.
-func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (Inode, error) {
 	return nil, syserror.EPERM
 }
 
 // NewNode implements Inode.NewNode.
-func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) {
 	return nil, syserror.EPERM
 }
 
@@ -82,7 +88,10 @@ func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOpt
 // inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not
 // represent directories can embed this to provide no-op implementations for
 // directory-related functions.
+//
+// +stateify savable
 type InodeNotDirectory struct {
+	InodeAlwaysValid
 }
 
 // HasChildren implements Inode.HasChildren.
@@ -91,89 +100,64 @@ func (InodeNotDirectory) HasChildren() bool {
 }
 
 // NewFile implements Inode.NewFile.
-func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) {
 	panic("NewFile called on non-directory inode")
 }
 
 // NewDir implements Inode.NewDir.
-func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) {
 	panic("NewDir called on non-directory inode")
 }
 
 // NewLink implements Inode.NewLinkink.
-func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewLink(context.Context, string, Inode) (Inode, error) {
 	panic("NewLink called on non-directory inode")
 }
 
 // NewSymlink implements Inode.NewSymlink.
-func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewSymlink(context.Context, string, string) (Inode, error) {
 	panic("NewSymlink called on non-directory inode")
 }
 
 // NewNode implements Inode.NewNode.
-func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) {
 	panic("NewNode called on non-directory inode")
 }
 
 // Unlink implements Inode.Unlink.
-func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) Unlink(context.Context, string, Inode) error {
 	panic("Unlink called on non-directory inode")
 }
 
 // RmDir implements Inode.RmDir.
-func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) RmDir(context.Context, string, Inode) error {
 	panic("RmDir called on non-directory inode")
 }
 
 // Rename implements Inode.Rename.
-func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Rename(context.Context, string, string, Inode, Inode) error {
 	panic("Rename called on non-directory inode")
 }
 
 // Lookup implements Inode.Lookup.
-func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Lookup(ctx context.Context, name string) (Inode, error) {
 	panic("Lookup called on non-directory inode")
 }
 
 // IterDirents implements Inode.IterDirents.
-func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+func (InodeNotDirectory) IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
 	panic("IterDirents called on non-directory inode")
 }
 
-// Valid implements Inode.Valid.
-func (InodeNotDirectory) Valid(context.Context) bool {
-	return true
-}
-
-// InodeNoDynamicLookup partially implements the Inode interface, specifically
-// the inodeDynamicLookup sub interface. Directory inodes that do not support
-// dymanic entries (i.e. entries that are not "hashed" into the
-// vfs.Dentry.children) can embed this to provide no-op implementations for
-// functions related to dynamic entries.
-type InodeNoDynamicLookup struct{}
-
-// Lookup implements Inode.Lookup.
-func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
-	return nil, syserror.ENOENT
-}
-
-// IterDirents implements Inode.IterDirents.
-func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
-	return offset, nil
-}
-
-// Valid implements Inode.Valid.
-func (InodeNoDynamicLookup) Valid(ctx context.Context) bool {
-	return true
-}
-
 // InodeNotSymlink partially implements the Inode interface, specifically the
 // inodeSymlink sub interface. All inodes that are not symlinks may embed this
 // to return the appropriate errors from symlink-related functions.
+//
+// +stateify savable
 type InodeNotSymlink struct{}
 
 // Readlink implements Inode.Readlink.
-func (InodeNotSymlink) Readlink(context.Context) (string, error) {
+func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) {
 	return "", syserror.EINVAL
 }
 
@@ -187,18 +171,26 @@ func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry,
 // inode attributes.
 //
 // Must be initialized by Init prior to first use.
+//
+// +stateify savable
 type InodeAttrs struct {
-	devMajor uint32
-	devMinor uint32
-	ino      uint64
-	mode     uint32
-	uid      uint32
-	gid      uint32
-	nlink    uint32
+	devMajor  uint32
+	devMinor  uint32
+	ino       uint64
+	mode      uint32
+	uid       uint32
+	gid       uint32
+	nlink     uint32
+	blockSize uint32
+
+	// Timestamps, all nsecs from the Unix epoch.
+	atime int64
+	mtime int64
+	ctime int64
 }
 
 // Init initializes this InodeAttrs.
-func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) {
+func (a *InodeAttrs) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) {
 	if mode.FileType() == 0 {
 		panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode))
 	}
@@ -214,6 +206,11 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, in
 	atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID))
 	atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID))
 	atomic.StoreUint32(&a.nlink, nlink)
+	atomic.StoreUint32(&a.blockSize, usermem.PageSize)
+	now := ktime.NowFromContext(ctx).Nanoseconds()
+	atomic.StoreInt64(&a.atime, now)
+	atomic.StoreInt64(&a.mtime, now)
+	atomic.StoreInt64(&a.ctime, now)
 }
 
 // DevMajor returns the device major number.
@@ -236,12 +233,33 @@ func (a *InodeAttrs) Mode() linux.FileMode {
 	return linux.FileMode(atomic.LoadUint32(&a.mode))
 }
 
+// TouchAtime updates a.atime to the current time.
+func (a *InodeAttrs) TouchAtime(ctx context.Context, mnt *vfs.Mount) {
+	if mnt.Flags.NoATime || mnt.ReadOnly() {
+		return
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return
+	}
+	atomic.StoreInt64(&a.atime, ktime.NowFromContext(ctx).Nanoseconds())
+	mnt.EndWrite()
+}
+
+// TouchCMtime updates a.{c/m}time to the current time. The caller should
+// synchronize calls to this so that ctime and mtime are updated to the same
+// value.
+func (a *InodeAttrs) TouchCMtime(ctx context.Context) {
+	now := ktime.NowFromContext(ctx).Nanoseconds()
+	atomic.StoreInt64(&a.mtime, now)
+	atomic.StoreInt64(&a.ctime, now)
+}
+
 // Stat partially implements Inode.Stat. Note that this function doesn't provide
 // all the stat fields, and the embedder should consider extending the result
 // with filesystem-specific fields.
 func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
 	var stat linux.Statx
-	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME
 	stat.DevMajor = a.devMajor
 	stat.DevMinor = a.devMinor
 	stat.Ino = atomic.LoadUint64(&a.ino)
@@ -249,9 +267,10 @@ func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (li
 	stat.UID = atomic.LoadUint32(&a.uid)
 	stat.GID = atomic.LoadUint32(&a.gid)
 	stat.Nlink = atomic.LoadUint32(&a.nlink)
-
-	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
-
+	stat.Blksize = atomic.LoadUint32(&a.blockSize)
+	stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.atime))
+	stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.mtime))
+	stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.ctime))
 	return stat, nil
 }
 
@@ -260,9 +279,17 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
 	if opts.Stat.Mask == 0 {
 		return nil
 	}
-	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
+
+	// Note that not all fields are modifiable. For example, the file type and
+	// inode numbers are immutable after node creation. Setting the size is often
+	// allowed by kernfs files but does not do anything. If some other behavior is
+	// needed, the embedder should consider extending SetStat.
+	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
 		return syserror.EPERM
 	}
+	if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() {
+		return syserror.EISDIR
+	}
 	if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
 		return err
 	}
@@ -285,12 +312,19 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
 		atomic.StoreUint32(&a.gid, stat.GID)
 	}
 
-	// Note that not all fields are modifiable. For example, the file type and
-	// inode numbers are immutable after node creation.
-
-	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
-	// Also, STATX_SIZE will need some special handling, because read-only static
-	// files should return EIO for truncate operations.
+	now := ktime.NowFromContext(ctx).Nanoseconds()
+	if stat.Mask&linux.STATX_ATIME != 0 {
+		if stat.Atime.Nsec == linux.UTIME_NOW {
+			stat.Atime = linux.NsecToStatxTimestamp(now)
+		}
+		atomic.StoreInt64(&a.atime, stat.Atime.ToNsec())
+	}
+	if stat.Mask&linux.STATX_MTIME != 0 {
+		if stat.Mtime.Nsec == linux.UTIME_NOW {
+			stat.Mtime = linux.NsecToStatxTimestamp(now)
+		}
+		atomic.StoreInt64(&a.mtime, stat.Mtime.ToNsec())
+	}
 
 	return nil
 }
@@ -321,13 +355,17 @@ func (a *InodeAttrs) DecLinks() {
 	}
 }
 
+// +stateify savable
 type slot struct {
-	Name   string
-	Dentry *vfs.Dentry
+	name   string
+	inode  Inode
+	static bool
 	slotEntry
 }
 
 // OrderedChildrenOptions contains initialization options for OrderedChildren.
+//
+// +stateify savable
 type OrderedChildrenOptions struct {
 	// Writable indicates whether vfs.FilesystemImpl methods implemented by
 	// OrderedChildren may modify the tracked children. This applies to
@@ -337,20 +375,28 @@ type OrderedChildrenOptions struct {
 }
 
 // OrderedChildren partially implements the Inode interface. OrderedChildren can
-// be embedded in directory inodes to keep track of the children in the
+// be embedded in directory inodes to keep track of children in the
 // directory, and can then be used to implement a generic directory FD -- see
-// GenericDirectoryFD. OrderedChildren is not compatible with dynamic
-// directories.
+// GenericDirectoryFD.
+//
+// OrderedChildren can represent a node in an Inode tree. The children inodes
+// might be directories themselves using OrderedChildren; hence extending the
+// tree. The parent inode (OrderedChildren user) holds a ref on all its static
+// children. This lets the static inodes outlive their associated dentry.
+// While the dentry might have to be regenerated via a Lookup() call, we can
+// keep reusing the same static inode. These static children inodes are finally
+// DecRef'd when this directory inode is being destroyed. This makes
+// OrderedChildren suitable for static directory entries as well.
 //
 // Must be initialize with Init before first use.
+//
+// +stateify savable
 type OrderedChildren struct {
-	refs.AtomicRefCount
-
 	// Can children be modified by user syscalls? It set to false, interface
 	// methods that would modify the children return EPERM. Immutable.
 	writable bool
 
-	mu    sync.RWMutex
+	mu    sync.RWMutex `state:"nosave"`
 	order slotList
 	set   map[string]*slot
 }
@@ -361,36 +407,66 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
 	o.set = make(map[string]*slot)
 }
 
-// DecRef implements Inode.DecRef.
-func (o *OrderedChildren) DecRef(ctx context.Context) {
-	o.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
-		o.mu.Lock()
-		defer o.mu.Unlock()
-		o.order.Reset()
-		o.set = nil
-	})
+// Destroy clears the children stored in o. It should be called by structs
+// embedding OrderedChildren upon destruction, i.e. when their reference count
+// reaches zero.
+func (o *OrderedChildren) Destroy(ctx context.Context) {
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	// Drop the ref that o owns on the static inodes it holds.
+	for _, s := range o.set {
+		if s.static {
+			s.inode.DecRef(ctx)
+		}
+	}
+	o.order.Reset()
+	o.set = nil
 }
 
-// Populate inserts children into this OrderedChildren, and d's dentry
-// cache. Populate returns the number of directories inserted, which the caller
+// Populate inserts static children into this OrderedChildren.
+// Populate returns the number of directories inserted, which the caller
 // may use to update the link count for the parent directory.
 //
-// Precondition: d must represent a directory inode. children must not contain
-// any conflicting entries already in o.
-func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 {
+// Precondition:
+//   * d must represent a directory inode.
+//   * children must not contain any conflicting entries already in o.
+//   * Caller must hold a reference on all inodes passed.
+//
+// Postcondition: Caller's references on inodes are transferred to o.
+func (o *OrderedChildren) Populate(children map[string]Inode) uint32 {
 	var links uint32
 	for name, child := range children {
-		if child.isDir() {
+		if child.Mode().IsDir() {
 			links++
 		}
-		if err := o.Insert(name, child.VFSDentry()); err != nil {
-			panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d))
+		if err := o.insert(name, child, true); err != nil {
+			panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v)", name, child))
 		}
-		d.InsertChild(name, child)
 	}
 	return links
 }
 
+// Lookup implements Inode.Lookup.
+func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error) {
+	o.mu.RLock()
+	defer o.mu.RUnlock()
+
+	s, ok := o.set[name]
+	if !ok {
+		return nil, syserror.ENOENT
+	}
+
+	s.inode.IncRef() // This ref is passed to the dentry upon creation via Init.
+	return s.inode, nil
+}
+
+// IterDirents implements Inode.IterDirents.
+func (o *OrderedChildren) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+	// All entries from OrderedChildren have already been handled in
+	// GenericDirectoryFD.IterDirents.
+	return offset, nil
+}
+
 // HasChildren implements Inode.HasChildren.
 func (o *OrderedChildren) HasChildren() bool {
 	o.mu.RLock()
@@ -398,17 +474,27 @@ func (o *OrderedChildren) HasChildren() bool {
 	return len(o.set) > 0
 }
 
-// Insert inserts child into o. This ignores the writability of o, as this is
-// not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
-func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error {
+// Insert inserts a dynamic child into o. This ignores the writability of o, as
+// this is not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
+func (o *OrderedChildren) Insert(name string, child Inode) error {
+	return o.insert(name, child, false)
+}
+
+// insert inserts child into o.
+//
+// Precondition: Caller must be holding a ref on child if static is true.
+//
+// Postcondition: Caller's ref on child is transferred to o if static is true.
+func (o *OrderedChildren) insert(name string, child Inode, static bool) error {
 	o.mu.Lock()
 	defer o.mu.Unlock()
 	if _, ok := o.set[name]; ok {
 		return syserror.EEXIST
 	}
 	s := &slot{
-		Name:   name,
-		Dentry: child,
+		name:   name,
+		inode:  child,
+		static: static,
 	}
 	o.order.PushBack(s)
 	o.set[name] = s
@@ -418,44 +504,49 @@ func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error {
 // Precondition: caller must hold o.mu for writing.
 func (o *OrderedChildren) removeLocked(name string) {
 	if s, ok := o.set[name]; ok {
+		if s.static {
+			panic(fmt.Sprintf("removeLocked called on a static inode: %v", s.inode))
+		}
 		delete(o.set, name)
 		o.order.Remove(s)
 	}
 }
 
 // Precondition: caller must hold o.mu for writing.
-func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry {
+func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, newI Inode) {
 	if s, ok := o.set[name]; ok {
+		if s.static {
+			panic(fmt.Sprintf("replacing a static inode: %v", s.inode))
+		}
+
 		// Existing slot with given name, simply replace the dentry.
-		var old *vfs.Dentry
-		old, s.Dentry = s.Dentry, new
-		return old
+		s.inode = newI
 	}
 
 	// No existing slot with given name, create and hash new slot.
 	s := &slot{
-		Name:   name,
-		Dentry: new,
+		name:   name,
+		inode:  newI,
+		static: false,
 	}
 	o.order.PushBack(s)
 	o.set[name] = s
-	return nil
 }
 
 // Precondition: caller must hold o.mu for reading or writing.
-func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error {
 	s, ok := o.set[name]
 	if !ok {
 		return syserror.ENOENT
 	}
-	if s.Dentry != child {
-		panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child))
+	if s.inode != child {
+		panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child))
 	}
 	return nil
 }
 
 // Unlink implements Inode.Unlink.
-func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) error {
 	if !o.writable {
 		return syserror.EPERM
 	}
@@ -470,13 +561,14 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.De
 	return nil
 }
 
-// Rmdir implements Inode.Rmdir.
-func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
+// RmDir implements Inode.RmDir.
+func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) error {
 	// We're not responsible for checking that child is a directory, that it's
 	// empty, or updating any link counts; so this is the same as unlink.
 	return o.Unlink(ctx, name, child)
 }
 
+// +stateify savable
 type renameAcrossDifferentImplementationsError struct{}
 
 func (renameAcrossDifferentImplementationsError) Error() string {
@@ -492,13 +584,13 @@ func (renameAcrossDifferentImplementationsError) Error() string {
 // that will support Rename.
 //
 // Postcondition: reference on any replaced dentry transferred to caller.
-func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) {
-	dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren)
+func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error {
+	dst, ok := dstDir.(interface{}).(*OrderedChildren)
 	if !ok {
-		return nil, renameAcrossDifferentImplementationsError{}
+		return renameAcrossDifferentImplementationsError{}
 	}
 	if !o.writable || !dst.writable {
-		return nil, syserror.EPERM
+		return syserror.EPERM
 	}
 	// Note: There's a potential deadlock below if concurrent calls to Rename
 	// refer to the same src and dst directories in reverse. We avoid any
@@ -511,12 +603,12 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c
 		defer dst.mu.Unlock()
 	}
 	if err := o.checkExistingLocked(oldname, child); err != nil {
-		return nil, err
+		return err
 	}
 
 	// TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
-	replaced := dst.replaceChildLocked(newname, child)
-	return replaced, nil
+	dst.replaceChildLocked(ctx, newname, child)
+	return nil
 }
 
 // nthLocked returns an iterator to the nth child tracked by this object. The
@@ -535,12 +627,14 @@ func (o *OrderedChildren) nthLocked(i int64) *slot {
 }
 
 // InodeSymlink partially implements Inode interface for symlinks.
+//
+// +stateify savable
 type InodeSymlink struct {
 	InodeNotDirectory
 }
 
 // Open implements Inode.Open.
-func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	return nil, syserror.ELOOP
 }
 
@@ -549,43 +643,46 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
 //
 // +stateify savable
 type StaticDirectory struct {
-	InodeNotSymlink
-	InodeDirectoryNoNewChildren
+	InodeAlwaysValid
 	InodeAttrs
-	InodeNoDynamicLookup
+	InodeDirectoryNoNewChildren
+	InodeNoStatFS
+	InodeNotSymlink
+	InodeTemporary
 	OrderedChildren
+	StaticDirectoryRefs
 
-	locks vfs.FileLocks
+	locks  vfs.FileLocks
+	fdOpts GenericDirectoryFDOptions
 }
 
 var _ Inode = (*StaticDirectory)(nil)
 
 // NewStaticDir creates a new static directory and returns its dentry.
-func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry {
+func NewStaticDir(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]Inode, fdOpts GenericDirectoryFDOptions) Inode {
 	inode := &StaticDirectory{}
-	inode.Init(creds, devMajor, devMinor, ino, perm)
-
-	dentry := &Dentry{}
-	dentry.Init(inode)
+	inode.Init(ctx, creds, devMajor, devMinor, ino, perm, fdOpts)
+	inode.EnableLeakCheck()
 
 	inode.OrderedChildren.Init(OrderedChildrenOptions{})
-	links := inode.OrderedChildren.Populate(dentry, children)
+	links := inode.OrderedChildren.Populate(children)
 	inode.IncLinks(links)
 
-	return dentry
+	return inode
 }
 
 // Init initializes StaticDirectory.
-func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+func (s *StaticDirectory) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, fdOpts GenericDirectoryFDOptions) {
 	if perm&^linux.PermissionsMask != 0 {
 		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
 	}
-	s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
+	s.fdOpts = fdOpts
+	s.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
 }
 
-// Open implements kernfs.Inode.
-func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts)
+// Open implements Inode.Open.
+func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := NewGenericDirectoryFD(rp.Mount(), d, &s.OrderedChildren, &s.locks, &opts, s.fdOpts)
 	if err != nil {
 		return nil, err
 	}
@@ -597,10 +694,38 @@ func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credenti
 	return syserror.EPERM
 }
 
-// AlwaysValid partially implements kernfs.inodeDynamicLookup.
-type AlwaysValid struct{}
+// DecRef implements Inode.DecRef.
+func (s *StaticDirectory) DecRef(ctx context.Context) {
+	s.StaticDirectoryRefs.DecRef(func() { s.Destroy(ctx) })
+}
 
-// Valid implements kernfs.inodeDynamicLookup.
-func (*AlwaysValid) Valid(context.Context) bool {
+// InodeAlwaysValid partially implements Inode.
+//
+// +stateify savable
+type InodeAlwaysValid struct{}
+
+// Valid implements Inode.Valid.
+func (*InodeAlwaysValid) Valid(context.Context) bool {
 	return true
 }
+
+// InodeTemporary partially implements Inode.
+//
+// +stateify savable
+type InodeTemporary struct{}
+
+// Keep implements Inode.Keep.
+func (*InodeTemporary) Keep() bool {
+	return false
+}
+
+// InodeNoStatFS partially implements the Inode interface, where the client
+// filesystem doesn't support statfs(2).
+//
+// +stateify savable
+type InodeNoStatFS struct{}
+
+// StatFS implements Inode.StatFS.
+func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+	return linux.Statfs{}, syserror.ENOSYS
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 51dbc050c..5c5e09ac5 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -29,12 +29,16 @@
 //
 // Reference Model:
 //
-// Kernfs dentries represents named pointers to inodes. Dentries and inode have
+// Kernfs dentries represents named pointers to inodes. Kernfs is solely
+// reponsible for maintaining and modifying its dentry tree; inode
+// implementations can not access the tree. Dentries and inodes have
 // independent lifetimes and reference counts. A child dentry unconditionally
 // holds a reference on its parent directory's dentry. A dentry also holds a
-// reference on the inode it points to. Multiple dentries can point to the same
-// inode (for example, in the case of hardlinks). File descriptors hold a
-// reference to the dentry they're opened on.
+// reference on the inode it points to (although that might not be the only
+// reference on the inode). Due to this inodes can outlive the dentries that
+// point to them. Multiple dentries can point to the same inode (for example,
+// in the case of hardlinks). File descriptors hold a reference to the dentry
+// they're opened on.
 //
 // Dentries are guaranteed to exist while holding Filesystem.mu for
 // reading. Dropping dentries require holding Filesystem.mu for writing. To
@@ -47,8 +51,8 @@
 //   kernfs.Dentry.dirMu
 //     vfs.VirtualFilesystem.mountMu
 //       vfs.Dentry.mu
-//   kernfs.Filesystem.droppedDentriesMu
 //   (inode implementation locks, if any)
+// kernfs.Filesystem.droppedDentriesMu
 package kernfs
 
 import (
@@ -57,7 +61,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -66,15 +69,17 @@ import (
 // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
 // filesystem. Concrete implementations are expected to embed this in their own
 // Filesystem type.
+//
+// +stateify savable
 type Filesystem struct {
 	vfsfs vfs.Filesystem
 
-	droppedDentriesMu sync.Mutex
+	droppedDentriesMu sync.Mutex `state:"nosave"`
 
 	// droppedDentries is a list of dentries waiting to be DecRef()ed. This is
 	// used to defer dentry destruction until mu can be acquired for
 	// writing. Protected by droppedDentriesMu.
-	droppedDentries []*vfs.Dentry
+	droppedDentries []*Dentry
 
 	// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
 	// for reading guarantees continued existence of any resolved dentries, but
@@ -93,22 +98,32 @@ type Filesystem struct {
 	// example:
 	//
 	//   fs.mu.RLock()
-	//   fs.mu.processDeferredDecRefs()
+	//   defer fs.processDeferredDecRefs()
 	//   defer fs.mu.RUnlock()
 	//   ...
 	//   fs.deferDecRef(dentry)
-	mu sync.RWMutex
+	mu sync.RWMutex `state:"nosave"`
 
 	// nextInoMinusOne is used to to allocate inode numbers on this
 	// filesystem. Must be accessed by atomic operations.
 	nextInoMinusOne uint64
+
+	// cachedDentries contains all dentries with 0 references. (Due to race
+	// conditions, it may also contain dentries with non-zero references.)
+	// cachedDentriesLen is the number of dentries in cachedDentries. These
+	// fields are protected by mu.
+	cachedDentries    dentryList
+	cachedDentriesLen uint64
+
+	// MaxCachedDentries is the maximum size of cachedDentries. If not set,
+	// defaults to 0 and kernfs does not cache any dentries. This is immutable.
+	MaxCachedDentries uint64
 }
 
 // deferDecRef defers dropping a dentry ref until the next call to
 // processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
-//
-// Precondition: d must not already be pending destruction.
-func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
+// This may be called while Filesystem.mu or Dentry.dirMu is locked.
+func (fs *Filesystem) deferDecRef(d *Dentry) {
 	fs.droppedDentriesMu.Lock()
 	fs.droppedDentries = append(fs.droppedDentries, d)
 	fs.droppedDentriesMu.Unlock()
@@ -116,17 +131,14 @@ func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
 
 // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
 // droppedDentries list. See comment on Filesystem.mu.
+//
+// Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked.
 func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) {
-	fs.mu.Lock()
-	fs.processDeferredDecRefsLocked(ctx)
-	fs.mu.Unlock()
-}
-
-// Precondition: fs.mu must be held for writing.
-func (fs *Filesystem) processDeferredDecRefsLocked(ctx context.Context) {
 	fs.droppedDentriesMu.Lock()
 	for _, d := range fs.droppedDentries {
-		d.DecRef(ctx)
+		// Defer the DecRef call so that we are not holding droppedDentriesMu
+		// when DecRef is called.
+		defer d.DecRef(ctx)
 	}
 	fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse.
 	fs.droppedDentriesMu.Unlock()
@@ -155,15 +167,24 @@ const (
 //
 // A kernfs dentry is similar to a dentry in a traditional filesystem: it's a
 // named reference to an inode. A dentry generally lives as long as it's part of
-// a mounted filesystem tree. Kernfs doesn't cache dentries once all references
-// to them are removed. Dentries hold a single reference to the inode they point
+// a mounted filesystem tree. Kernfs drops dentries once all references to them
+// are dropped. Dentries hold a single reference to the inode they point
 // to, and child dentries hold a reference on their parent.
 //
 // Must be initialized by Init prior to first use.
+//
+// +stateify savable
 type Dentry struct {
 	vfsd vfs.Dentry
 
-	refs.AtomicRefCount
+	// refs is the reference count. When refs reaches 0, the dentry may be
+	// added to the cache or destroyed. If refs == -1, the dentry has already
+	// been destroyed. refs are allowed to go to 0 and increase again. refs is
+	// accessed using atomic memory operations.
+	refs int64
+
+	// fs is the owning filesystem. fs is immutable.
+	fs *Filesystem
 
 	// flags caches useful information about the dentry from the inode. See the
 	// dflags* consts above. Must be accessed by atomic ops.
@@ -172,21 +193,177 @@ type Dentry struct {
 	parent *Dentry
 	name   string
 
+	// If cached is true, dentryEntry links dentry into
+	// Filesystem.cachedDentries. cached and dentryEntry are protected by
+	// Filesystem.mu.
+	cached bool
+	dentryEntry
+
 	// dirMu protects children and the names of child Dentries.
-	dirMu    sync.Mutex
+	//
+	// Note that holding fs.mu for writing is not sufficient;
+	// revalidateChildLocked(), which is a very hot path, may modify children with
+	// fs.mu acquired for reading only.
+	dirMu    sync.Mutex `state:"nosave"`
 	children map[string]*Dentry
 
 	inode Inode
 }
 
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *Dentry) IncRef() {
+	// d.refs may be 0 if d.fs.mu is locked, which serializes against
+	// d.cacheLocked().
+	atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *Dentry) TryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&d.refs)
+		if refs <= 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *Dentry) DecRef(ctx context.Context) {
+	if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+		d.fs.mu.Lock()
+		d.cacheLocked(ctx)
+		d.fs.mu.Unlock()
+	} else if refs < 0 {
+		panic("kernfs.Dentry.DecRef() called without holding a reference")
+	}
+}
+
+// cacheLocked should be called after d's reference count becomes 0. The ref
+// count check may happen before acquiring d.fs.mu so there might be a race
+// condition where the ref count is increased again by the time the caller
+// acquires d.fs.mu. This race is handled.
+// Only reachable dentries are added to the cache. However, a dentry might
+// become unreachable *while* it is in the cache due to invalidation.
+//
+// Preconditions: d.fs.mu must be locked for writing.
+func (d *Dentry) cacheLocked(ctx context.Context) {
+	// Dentries with a non-zero reference count must be retained. (The only way
+	// to obtain a reference on a dentry with zero references is via path
+	// resolution, which requires d.fs.mu, so if d.refs is zero then it will
+	// remain zero while we hold d.fs.mu for writing.)
+	refs := atomic.LoadInt64(&d.refs)
+	if refs == -1 {
+		// Dentry has already been destroyed.
+		panic(fmt.Sprintf("cacheLocked called on a dentry which has already been destroyed: %v", d))
+	}
+	if refs > 0 {
+		if d.cached {
+			d.fs.cachedDentries.Remove(d)
+			d.fs.cachedDentriesLen--
+			d.cached = false
+		}
+		return
+	}
+	// If the dentry is deleted and invalidated or has no parent, then it is no
+	// longer reachable by path resolution and should be dropped immediately
+	// because it has zero references.
+	// Note that a dentry may not always have a parent; for example magic links
+	// as described in Inode.Getlink.
+	if isDead := d.VFSDentry().IsDead(); isDead || d.parent == nil {
+		if !isDead {
+			d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry())
+		}
+		if d.cached {
+			d.fs.cachedDentries.Remove(d)
+			d.fs.cachedDentriesLen--
+			d.cached = false
+		}
+		d.destroyLocked(ctx)
+		return
+	}
+	// If d is already cached, just move it to the front of the LRU.
+	if d.cached {
+		d.fs.cachedDentries.Remove(d)
+		d.fs.cachedDentries.PushFront(d)
+		return
+	}
+	// Cache the dentry, then evict the least recently used cached dentry if
+	// the cache becomes over-full.
+	d.fs.cachedDentries.PushFront(d)
+	d.fs.cachedDentriesLen++
+	d.cached = true
+	if d.fs.cachedDentriesLen <= d.fs.MaxCachedDentries {
+		return
+	}
+	// Evict the least recently used dentry because cache size is greater than
+	// max cache size (configured on mount).
+	victim := d.fs.cachedDentries.Back()
+	d.fs.cachedDentries.Remove(victim)
+	d.fs.cachedDentriesLen--
+	victim.cached = false
+	// victim.refs may have become non-zero from an earlier path resolution
+	// after it was inserted into fs.cachedDentries.
+	if atomic.LoadInt64(&victim.refs) == 0 {
+		if !victim.vfsd.IsDead() {
+			victim.parent.dirMu.Lock()
+			// Note that victim can't be a mount point (in any mount
+			// namespace), since VFS holds references on mount points.
+			d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, victim.VFSDentry())
+			delete(victim.parent.children, victim.name)
+			victim.parent.dirMu.Unlock()
+		}
+		victim.destroyLocked(ctx)
+	}
+	// Whether or not victim was destroyed, we brought fs.cachedDentriesLen
+	// back down to fs.MaxCachedDentries, so we don't loop.
+}
+
+// destroyLocked destroys the dentry.
+//
+// Preconditions:
+// * d.fs.mu must be locked for writing.
+// * d.refs == 0.
+// * d should have been removed from d.parent.children, i.e. d is not reachable
+//   by path traversal.
+// * d.vfsd.IsDead() is true.
+func (d *Dentry) destroyLocked(ctx context.Context) {
+	switch atomic.LoadInt64(&d.refs) {
+	case 0:
+		// Mark the dentry destroyed.
+		atomic.StoreInt64(&d.refs, -1)
+	case -1:
+		panic("dentry.destroyLocked() called on already destroyed dentry")
+	default:
+		panic("dentry.destroyLocked() called with references on the dentry")
+	}
+
+	d.inode.DecRef(ctx) // IncRef from Init.
+	d.inode = nil
+
+	// Drop the reference held by d on its parent without recursively locking
+	// d.fs.mu.
+	if d.parent != nil {
+		if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+			d.parent.cacheLocked(ctx)
+		} else if refs < 0 {
+			panic("kernfs.Dentry.DecRef() called without holding a reference")
+		}
+	}
+}
+
 // Init initializes this dentry.
 //
 // Precondition: Caller must hold a reference on inode.
 //
 // Postcondition: Caller's reference on inode is transferred to the dentry.
-func (d *Dentry) Init(inode Inode) {
+func (d *Dentry) Init(fs *Filesystem, inode Inode) {
 	d.vfsd.Init(d)
+	d.fs = fs
 	d.inode = inode
+	atomic.StoreInt64(&d.refs, 1)
 	ftype := inode.Mode().FileType()
 	if ftype == linux.ModeDirectory {
 		d.flags |= dflagsIsDir
@@ -211,20 +388,6 @@ func (d *Dentry) isSymlink() bool {
 	return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0
 }
 
-// DecRef implements vfs.DentryImpl.DecRef.
-func (d *Dentry) DecRef(ctx context.Context) {
-	d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy)
-}
-
-// Precondition: Dentry must be removed from VFS' dentry cache.
-func (d *Dentry) destroy(ctx context.Context) {
-	d.inode.DecRef(ctx) // IncRef from Init.
-	d.inode = nil
-	if d.parent != nil {
-		d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild.
-	}
-}
-
 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
 //
 // Although Linux technically supports inotify on pseudo filesystems (inotify
@@ -240,25 +403,29 @@ func (d *Dentry) Watches() *vfs.Watches {
 // OnZeroWatches implements vfs.Dentry.OnZeroWatches.
 func (d *Dentry) OnZeroWatches(context.Context) {}
 
-// InsertChild inserts child into the vfs dentry cache with the given name under
-// this dentry. This does not update the directory inode, so calling this on
-// its own isn't sufficient to insert a child into a directory. InsertChild
-// updates the link count on d if required.
+// insertChild inserts child into the vfs dentry cache with the given name under
+// this dentry. This does not update the directory inode, so calling this on its
+// own isn't sufficient to insert a child into a directory.
 //
-// Precondition: d must represent a directory inode.
-func (d *Dentry) InsertChild(name string, child *Dentry) {
+// Preconditions:
+// * d must represent a directory inode.
+// * d.fs.mu must be locked for at least reading.
+func (d *Dentry) insertChild(name string, child *Dentry) {
 	d.dirMu.Lock()
 	d.insertChildLocked(name, child)
 	d.dirMu.Unlock()
 }
 
-// insertChildLocked is equivalent to InsertChild, with additional
+// insertChildLocked is equivalent to insertChild, with additional
 // preconditions.
 //
-// Precondition: d.dirMu must be locked.
+// Preconditions:
+// * d must represent a directory inode.
+// * d.dirMu must be locked.
+// * d.fs.mu must be locked for at least reading.
 func (d *Dentry) insertChildLocked(name string, child *Dentry) {
 	if !d.isDir() {
-		panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
+		panic(fmt.Sprintf("insertChildLocked called on non-directory Dentry: %+v.", d))
 	}
 	d.IncRef() // DecRef in child's Dentry.destroy.
 	child.parent = d
@@ -289,7 +456,6 @@ func (d *Dentry) Inode() Inode {
 //
 // - Checking that dentries passed to methods are of the appropriate file type.
 // - Checking permissions.
-// - Updating link and reference counts.
 //
 // Specific responsibilities of implementations are documented below.
 type Inode interface {
@@ -299,7 +465,8 @@ type Inode interface {
 	inodeRefs
 
 	// Methods related to node metadata. A generic implementation is provided by
-	// InodeAttrs.
+	// InodeAttrs. Note that a concrete filesystem using kernfs is responsible for
+	// managing link counts.
 	inodeMetadata
 
 	// Method for inodes that represent symlink. InodeNotSymlink provides a
@@ -310,18 +477,26 @@ type Inode interface {
 	// a blanket implementation for all non-directory inodes.
 	inodeDirectory
 
-	// Method for inodes that represent dynamic directories and their
-	// children. InodeNoDynamicLookup provides a blanket implementation for all
-	// non-dynamic-directory inodes.
-	inodeDynamicLookup
-
 	// Open creates a file description for the filesystem object represented by
 	// this inode. The returned file description should hold a reference on the
-	// inode for its lifetime.
+	// dentry for its lifetime.
 	//
 	// Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing
 	// the inode on which Open() is being called.
-	Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
+	Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
+
+	// StatFS returns filesystem statistics for the client filesystem. This
+	// corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem
+	// doesn't support statfs(2), this should return ENOSYS.
+	StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error)
+
+	// Keep indicates whether the dentry created after Inode.Lookup should be
+	// kept in the kernfs dentry tree.
+	Keep() bool
+
+	// Valid should return true if this inode is still valid, or needs to
+	// be resolved again by a call to Lookup.
+	Valid(ctx context.Context) bool
 }
 
 type inodeRefs interface {
@@ -354,8 +529,8 @@ type inodeMetadata interface {
 // Precondition: All methods in this interface may only be called on directory
 // inodes.
 type inodeDirectory interface {
-	// The New{File,Dir,Node,Symlink} methods below should return a new inode
-	// hashed into this inode.
+	// The New{File,Dir,Node,Link,Symlink} methods below should return a new inode
+	// that will be hashed into the dentry tree.
 	//
 	// These inode constructors are inode-level operations rather than
 	// filesystem-level operations to allow client filesystems to mix different
@@ -366,75 +541,69 @@ type inodeDirectory interface {
 	HasChildren() bool
 
 	// NewFile creates a new regular file inode.
-	NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error)
+	NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error)
 
 	// NewDir creates a new directory inode.
-	NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error)
+	NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error)
 
 	// NewLink creates a new hardlink to a specified inode in this
 	// directory. Implementations should create a new kernfs Dentry pointing to
 	// target, and update target's link count.
-	NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error)
+	NewLink(ctx context.Context, name string, target Inode) (Inode, error)
 
 	// NewSymlink creates a new symbolic link inode.
-	NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error)
+	NewSymlink(ctx context.Context, name, target string) (Inode, error)
 
 	// NewNode creates a new filesystem node for a mknod syscall.
-	NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error)
+	NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error)
 
 	// Unlink removes a child dentry from this directory inode.
-	Unlink(ctx context.Context, name string, child *vfs.Dentry) error
+	Unlink(ctx context.Context, name string, child Inode) error
 
 	// RmDir removes an empty child directory from this directory
 	// inode. Implementations must update the parent directory's link count,
 	// if required. Implementations are not responsible for checking that child
 	// is a directory, checking for an empty directory.
-	RmDir(ctx context.Context, name string, child *vfs.Dentry) error
+	RmDir(ctx context.Context, name string, child Inode) error
 
 	// Rename is called on the source directory containing an inode being
 	// renamed. child should point to the resolved child in the source
-	// directory. If Rename replaces a dentry in the destination directory, it
-	// should return the replaced dentry or nil otherwise.
+	// directory.
 	//
 	// Precondition: Caller must serialize concurrent calls to Rename.
-	Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error)
-}
+	Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error
 
-type inodeDynamicLookup interface {
-	// Lookup should return an appropriate dentry if name should resolve to a
-	// child of this dynamic directory inode. This gives the directory an
-	// opportunity on every lookup to resolve additional entries that aren't
-	// hashed into the directory. This is only called when the inode is a
-	// directory. If the inode is not a directory, or if the directory only
-	// contains a static set of children, the implementer can unconditionally
-	// return an appropriate error (ENOTDIR and ENOENT respectively).
+	// Lookup should return an appropriate inode if name should resolve to a
+	// child of this directory inode. This gives the directory an opportunity
+	// on every lookup to resolve additional entries. This is only called when
+	// the inode is a directory.
 	//
-	// The child returned by Lookup will be hashed into the VFS dentry tree. Its
-	// lifetime can be controlled by the filesystem implementation with an
-	// appropriate implementation of Valid.
+	// The child returned by Lookup will be hashed into the VFS dentry tree,
+	// at least for the duration of the current FS operation.
 	//
-	// Lookup returns the child with an extra reference and the caller owns this
-	// reference.
-	Lookup(ctx context.Context, name string) (*vfs.Dentry, error)
-
-	// Valid should return true if this inode is still valid, or needs to
-	// be resolved again by a call to Lookup.
-	Valid(ctx context.Context) bool
+	// Lookup must return the child with an extra reference whose ownership is
+	// transferred to the dentry that is created to point to that inode. If
+	// Inode.Keep returns false, that new dentry will be dropped at the end of
+	// the current filesystem operation (before returning back to the VFS
+	// layer) if no other ref is picked on that dentry. If Inode.Keep returns
+	// true, then the dentry will be cached into the dentry tree until it is
+	// Unlink'd or RmDir'd.
+	Lookup(ctx context.Context, name string) (Inode, error)
 
 	// IterDirents is used to iterate over dynamically created entries. It invokes
-	// cb on each entry in the directory represented by the FileDescription.
+	// cb on each entry in the directory represented by the Inode.
 	// 'offset' is the offset for the entire IterDirents call, which may include
 	// results from the caller (e.g. "." and ".."). 'relOffset' is the offset
 	// inside the entries returned by this IterDirents invocation. In other words,
 	// 'offset' should be used to calculate each vfs.Dirent.NextOff as well as
 	// the return value, while 'relOffset' is the place to start iteration.
-	IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
+	IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
 }
 
 type inodeSymlink interface {
 	// Readlink returns the target of a symbolic link. If an inode is not a
 	// symlink, the implementation should return EINVAL.
-	Readlink(ctx context.Context) (string, error)
+	Readlink(ctx context.Context, mnt *vfs.Mount) (string, error)
 
 	// Getlink returns the target of a symbolic link, as used by path
 	// resolution:
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index c5d5afedf..2418eec44 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -36,7 +36,7 @@ const staticFileContent = "This is sample content for a static test file."
 
 // RootDentryFn is a generator function for creating the root dentry of a test
 // filesystem. See newTestSystem.
-type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
+type RootDentryFn func(context.Context, *auth.Credentials, *filesystem) kernfs.Inode
 
 // newTestSystem sets up a minimal environment for running a test, including an
 // instance of a test filesystem. Tests can control the contents of the
@@ -52,7 +52,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
 	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
+	mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create testfs root mount: %v", err)
 	}
@@ -72,14 +72,11 @@ type file struct {
 	content string
 }
 
-func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry {
+func (fs *filesystem) newFile(ctx context.Context, creds *auth.Credentials, content string) kernfs.Inode {
 	f := &file{}
 	f.content = content
-	f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777)
-
-	d := &kernfs.Dentry{}
-	d.Init(f)
-	return d
+	f.DynamicBytesFile.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777)
+	return f
 }
 
 func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error {
@@ -96,100 +93,112 @@ func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.S
 }
 
 type readonlyDir struct {
+	readonlyDirRefs
 	attrs
-	kernfs.InodeNotSymlink
-	kernfs.InodeNoDynamicLookup
+	kernfs.InodeAlwaysValid
 	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNoStatFS
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
 
 	locks vfs.FileLocks
-
-	dentry kernfs.Dentry
 }
 
-func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newReadonlyDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
 	dir := &readonlyDir{}
-	dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.attrs.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
 	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	dir.dentry.Init(dir)
-
-	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
-
-	return &dir.dentry
+	dir.EnableLeakCheck()
+	dir.IncLinks(dir.OrderedChildren.Populate(contents))
+	return dir
 }
 
-func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
+func (d *readonlyDir) DecRef(ctx context.Context) {
+	d.readonlyDirRefs.DecRef(func() { d.Destroy(ctx) })
+}
+
 type dir struct {
+	dirRefs
 	attrs
+	kernfs.InodeAlwaysValid
 	kernfs.InodeNotSymlink
-	kernfs.InodeNoDynamicLookup
+	kernfs.InodeNoStatFS
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
 
 	locks vfs.FileLocks
 
-	fs     *filesystem
-	dentry kernfs.Dentry
+	fs *filesystem
 }
 
-func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
 	dir := &dir{}
 	dir.fs = fs
-	dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.attrs.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
 	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
-	dir.dentry.Init(dir)
+	dir.EnableLeakCheck()
 
-	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
-
-	return &dir.dentry
+	dir.IncLinks(dir.OrderedChildren.Populate(contents))
+	return dir
 }
 
-func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
-func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (d *dir) DecRef(ctx context.Context) {
+	d.dirRefs.DecRef(func() { d.Destroy(ctx) })
+}
+
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
 	creds := auth.CredentialsFromContext(ctx)
-	dir := d.fs.newDir(creds, opts.Mode, nil)
-	dirVFSD := dir.VFSDentry()
-	if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil {
+	dir := d.fs.newDir(ctx, creds, opts.Mode, nil)
+	if err := d.OrderedChildren.Insert(name, dir); err != nil {
 		dir.DecRef(ctx)
 		return nil, err
 	}
+	d.TouchCMtime(ctx)
 	d.IncLinks(1)
-	return dirVFSD, nil
+	return dir, nil
 }
 
-func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) {
 	creds := auth.CredentialsFromContext(ctx)
-	f := d.fs.newFile(creds, "")
-	fVFSD := f.VFSDentry()
-	if err := d.OrderedChildren.Insert(name, fVFSD); err != nil {
+	f := d.fs.newFile(ctx, creds, "")
+	if err := d.OrderedChildren.Insert(name, f); err != nil {
 		f.DecRef(ctx)
 		return nil, err
 	}
-	return fVFSD, nil
+	d.TouchCMtime(ctx)
+	return f, nil
 }
 
-func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) {
+func (*dir) NewLink(context.Context, string, kernfs.Inode) (kernfs.Inode, error) {
 	return nil, syserror.EPERM
 }
 
-func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (*dir) NewSymlink(context.Context, string, string) (kernfs.Inode, error) {
 	return nil, syserror.EPERM
 }
 
-func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (kernfs.Inode, error) {
 	return nil, syserror.EPERM
 }
 
@@ -197,19 +206,23 @@ func (fsType) Name() string {
 	return "kernfs"
 }
 
+func (fsType) Release(ctx context.Context) {}
+
 func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	fs := &filesystem{}
 	fs.VFSFilesystem().Init(vfsObj, &fst, fs)
-	root := fst.rootFn(creds, fs)
-	return fs.VFSFilesystem(), root.VFSDentry(), nil
+	root := fst.rootFn(ctx, creds, fs)
+	var d kernfs.Dentry
+	d.Init(&fs.Filesystem, root)
+	return fs.VFSFilesystem(), d.VFSDentry(), nil
 }
 
 // -------------------- Remainder of the file are test cases --------------------
 
 func TestBasic(t *testing.T) {
-	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
-		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
-			"file1": fs.newFile(creds, staticFileContent),
+	sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+		return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
+			"file1": fs.newFile(ctx, creds, staticFileContent),
 		})
 	})
 	defer sys.Destroy()
@@ -217,9 +230,9 @@ func TestBasic(t *testing.T) {
 }
 
 func TestMkdirGetDentry(t *testing.T) {
-	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
-		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
-			"dir1": fs.newDir(creds, 0755, nil),
+	sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+		return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
+			"dir1": fs.newDir(ctx, creds, 0755, nil),
 		})
 	})
 	defer sys.Destroy()
@@ -232,9 +245,9 @@ func TestMkdirGetDentry(t *testing.T) {
 }
 
 func TestReadStaticFile(t *testing.T) {
-	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
-		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
-			"file1": fs.newFile(creds, staticFileContent),
+	sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+		return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
+			"file1": fs.newFile(ctx, creds, staticFileContent),
 		})
 	})
 	defer sys.Destroy()
@@ -258,9 +271,9 @@ func TestReadStaticFile(t *testing.T) {
 }
 
 func TestCreateNewFileInStaticDir(t *testing.T) {
-	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
-		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
-			"dir1": fs.newDir(creds, 0755, nil),
+	sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+		return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
+			"dir1": fs.newDir(ctx, creds, 0755, nil),
 		})
 	})
 	defer sys.Destroy()
@@ -285,8 +298,8 @@ func TestCreateNewFileInStaticDir(t *testing.T) {
 }
 
 func TestDirFDReadWrite(t *testing.T) {
-	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
-		return fs.newReadonlyDir(creds, 0755, nil)
+	sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+		return fs.newReadonlyDir(ctx, creds, 0755, nil)
 	})
 	defer sys.Destroy()
 
@@ -309,14 +322,14 @@ func TestDirFDReadWrite(t *testing.T) {
 }
 
 func TestDirFDIterDirents(t *testing.T) {
-	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
-		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+	sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+		return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
 			// Fill root with nodes backed by various inode implementations.
-			"dir1": fs.newReadonlyDir(creds, 0755, nil),
-			"dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{
-				"dir3": fs.newDir(creds, 0755, nil),
+			"dir1": fs.newReadonlyDir(ctx, creds, 0755, nil),
+			"dir2": fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{
+				"dir3": fs.newDir(ctx, creds, 0755, nil),
 			}),
-			"file1": fs.newFile(creds, staticFileContent),
+			"file1": fs.newFile(ctx, creds, staticFileContent),
 		})
 	})
 	defer sys.Destroy()
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 2ab3f53fd..a0736c0d6 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -24,10 +24,13 @@ import (
 
 // StaticSymlink provides an Inode implementation for symlinks that point to
 // a immutable target.
+//
+// +stateify savable
 type StaticSymlink struct {
 	InodeAttrs
 	InodeNoopRefCount
 	InodeSymlink
+	InodeNoStatFS
 
 	target string
 }
@@ -35,23 +38,20 @@ type StaticSymlink struct {
 var _ Inode = (*StaticSymlink)(nil)
 
 // NewStaticSymlink creates a new symlink file pointing to 'target'.
-func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry {
+func NewStaticSymlink(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) Inode {
 	inode := &StaticSymlink{}
-	inode.Init(creds, devMajor, devMinor, ino, target)
-
-	d := &Dentry{}
-	d.Init(inode)
-	return d
+	inode.Init(ctx, creds, devMajor, devMinor, ino, target)
+	return inode
 }
 
 // Init initializes the instance.
-func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) {
+func (s *StaticSymlink) Init(ctx context.Context, creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) {
 	s.target = target
-	s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777)
+	s.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeSymlink|0777)
 }
 
-// Readlink implements Inode.
-func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
+// Readlink implements Inode.Readlink.
+func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) {
 	return s.target, nil
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
new file mode 100644
index 000000000..463d77d79
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
@@ -0,0 +1,113 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// syntheticDirectory implements kernfs.Inode for a directory created by
+// MkdirAt(ForSyntheticMountpoint=true).
+//
+// +stateify savable
+type syntheticDirectory struct {
+	InodeAlwaysValid
+	InodeAttrs
+	InodeNoStatFS
+	InodeNotSymlink
+	OrderedChildren
+	syntheticDirectoryRefs
+
+	locks vfs.FileLocks
+}
+
+var _ Inode = (*syntheticDirectory)(nil)
+
+func newSyntheticDirectory(ctx context.Context, creds *auth.Credentials, perm linux.FileMode) Inode {
+	inode := &syntheticDirectory{}
+	inode.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, perm)
+	return inode
+}
+
+func (dir *syntheticDirectory) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+	if perm&^linux.PermissionsMask != 0 {
+		panic(fmt.Sprintf("perm contains non-permission bits: %#o", perm))
+	}
+	dir.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.S_IFDIR|perm)
+	dir.OrderedChildren.Init(OrderedChildrenOptions{
+		Writable: true,
+	})
+}
+
+// Open implements Inode.Open.
+func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := NewGenericDirectoryFD(rp.Mount(), d, &dir.OrderedChildren, &dir.locks, &opts, GenericDirectoryFDOptions{})
+	if err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// NewFile implements Inode.NewFile.
+func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) {
+	return nil, syserror.EPERM
+}
+
+// NewDir implements Inode.NewDir.
+func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) {
+	if !opts.ForSyntheticMountpoint {
+		return nil, syserror.EPERM
+	}
+	subdirI := newSyntheticDirectory(ctx, auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask)
+	if err := dir.OrderedChildren.Insert(name, subdirI); err != nil {
+		subdirI.DecRef(ctx)
+		return nil, err
+	}
+	dir.TouchCMtime(ctx)
+	return subdirI, nil
+}
+
+// NewLink implements Inode.NewLink.
+func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (Inode, error) {
+	return nil, syserror.EPERM
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (Inode, error) {
+	return nil, syserror.EPERM
+}
+
+// NewNode implements Inode.NewNode.
+func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) {
+	return nil, syserror.EPERM
+}
+
+// DecRef implements Inode.DecRef.
+func (dir *syntheticDirectory) DecRef(ctx context.Context) {
+	dir.syntheticDirectoryRefs.DecRef(func() { dir.Destroy(ctx) })
+}
+
+// Keep implements Inode.Keep. This is redundant because inodes will never be
+// created via Lookup and inodes are always valid. Makes sense to return true
+// because these inodes are not temporary and should only be removed on RmDir.
+func (dir *syntheticDirectory) Keep() bool {
+	return true
+}
diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD
index 8cf5b35d3..fd6c55921 100644
--- a/pkg/sentry/fsimpl/overlay/BUILD
+++ b/pkg/sentry/fsimpl/overlay/BUILD
@@ -21,14 +21,18 @@ go_library(
         "directory.go",
         "filesystem.go",
         "fstree.go",
-        "non_directory.go",
         "overlay.go",
+        "regular_file.go",
+        "save_restore.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/refsvfs2",
+        "//pkg/sentry/arch",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
@@ -37,5 +41,6 @@ go_library(
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
+        "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
index b3d19ff82..4506642ca 100644
--- a/pkg/sentry/fsimpl/overlay/copy_up.go
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -22,6 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -40,6 +42,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		return nil
 	}
 
+	// Attach our credentials to the context, as some VFS operations use
+	// credentials from context rather an take an explicit creds parameter.
+	ctx = auth.ContextWithCredentials(ctx, d.fs.creds)
+
 	ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT
 	switch ftype {
 	case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR:
@@ -69,13 +75,28 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		return syserror.ENOENT
 	}
 
-	// Perform copy-up.
+	// Obtain settable timestamps from the lower layer.
 	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	oldpop := vfs.PathOperation{
+		Root:  d.lowerVDs[0],
+		Start: d.lowerVDs[0],
+	}
+	const timestampsMask = linux.STATX_ATIME | linux.STATX_MTIME
+	oldStat, err := vfsObj.StatAt(ctx, d.fs.creds, &oldpop, &vfs.StatOptions{
+		Mask: timestampsMask,
+	})
+	if err != nil {
+		return err
+	}
+
+	// Perform copy-up.
 	newpop := vfs.PathOperation{
 		Root:  d.parent.upperVD,
 		Start: d.parent.upperVD,
 		Path:  fspath.Parse(d.name),
 	}
+	// Used during copy-up of memory-mapped regular files.
+	var mmapOpts *memmap.MMapOpts
 	cleanupUndoCopyUp := func() {
 		var err error
 		if ftype == linux.S_IFDIR {
@@ -84,15 +105,16 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 			err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop)
 		}
 		if err != nil {
-			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)
+			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err))
+		}
+		if d.upperVD.Ok() {
+			d.upperVD.DecRef(ctx)
+			d.upperVD = vfs.VirtualDentry{}
 		}
 	}
 	switch ftype {
 	case linux.S_IFREG:
-		oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
-			Root:  d.lowerVDs[0],
-			Start: d.lowerVDs[0],
-		}, &vfs.OpenOptions{
+		oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &oldpop, &vfs.OpenOptions{
 			Flags: linux.O_RDONLY,
 		})
 		if err != nil {
@@ -127,11 +149,32 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 				break
 			}
 		}
+		d.mapsMu.Lock()
+		defer d.mapsMu.Unlock()
+		if d.wrappedMappable != nil {
+			// We may have memory mappings of the file on the lower layer.
+			// Switch to mapping the file on the upper layer instead.
+			mmapOpts = &memmap.MMapOpts{
+				Perms:    usermem.ReadWrite,
+				MaxPerms: usermem.ReadWrite,
+			}
+			if err := newFD.ConfigureMMap(ctx, mmapOpts); err != nil {
+				cleanupUndoCopyUp()
+				return err
+			}
+			if mmapOpts.MappingIdentity != nil {
+				mmapOpts.MappingIdentity.DecRef(ctx)
+			}
+			// Don't actually switch Mappables until the end of copy-up; see
+			// below for why.
+		}
 		if err := newFD.SetStat(ctx, vfs.SetStatOptions{
 			Stat: linux.Statx{
-				Mask: linux.STATX_UID | linux.STATX_GID,
-				UID:  d.uid,
-				GID:  d.gid,
+				Mask:  linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+				UID:   d.uid,
+				GID:   d.gid,
+				Atime: oldStat.Atime,
+				Mtime: oldStat.Mtime,
 			},
 		}); err != nil {
 			cleanupUndoCopyUp()
@@ -148,9 +191,11 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		}
 		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
 			Stat: linux.Statx{
-				Mask: linux.STATX_UID | linux.STATX_GID,
-				UID:  d.uid,
-				GID:  d.gid,
+				Mask:  linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+				UID:   d.uid,
+				GID:   d.gid,
+				Atime: oldStat.Atime,
+				Mtime: oldStat.Mtime,
 			},
 		}); err != nil {
 			cleanupUndoCopyUp()
@@ -164,10 +209,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		d.upperVD = upperVD
 
 	case linux.S_IFLNK:
-		target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
-			Root:  d.lowerVDs[0],
-			Start: d.lowerVDs[0],
-		})
+		target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &oldpop)
 		if err != nil {
 			return err
 		}
@@ -176,10 +218,12 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		}
 		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
 			Stat: linux.Statx{
-				Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID,
-				Mode: uint16(d.mode),
-				UID:  d.uid,
-				GID:  d.gid,
+				Mask:  linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+				Mode:  uint16(d.mode),
+				UID:   d.uid,
+				GID:   d.gid,
+				Atime: oldStat.Atime,
+				Mtime: oldStat.Mtime,
 			},
 		}); err != nil {
 			cleanupUndoCopyUp()
@@ -193,25 +237,20 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		d.upperVD = upperVD
 
 	case linux.S_IFBLK, linux.S_IFCHR:
-		lowerStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
-			Root:  d.lowerVDs[0],
-			Start: d.lowerVDs[0],
-		}, &vfs.StatOptions{})
-		if err != nil {
-			return err
-		}
 		if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{
 			Mode:     linux.FileMode(d.mode),
-			DevMajor: lowerStat.RdevMajor,
-			DevMinor: lowerStat.RdevMinor,
+			DevMajor: oldStat.RdevMajor,
+			DevMinor: oldStat.RdevMinor,
 		}); err != nil {
 			return err
 		}
 		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
 			Stat: linux.Statx{
-				Mask: linux.STATX_UID | linux.STATX_GID,
-				UID:  d.uid,
-				GID:  d.gid,
+				Mask:  linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+				UID:   d.uid,
+				GID:   d.gid,
+				Atime: oldStat.Atime,
+				Mtime: oldStat.Mtime,
 			},
 		}); err != nil {
 			cleanupUndoCopyUp()
@@ -229,7 +268,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		panic(fmt.Sprintf("unexpected file type %o", ftype))
 	}
 
-	// TODO(gvisor.dev/issue/1199): copy up xattrs
+	if err := d.copyXattrsLocked(ctx); err != nil {
+		cleanupUndoCopyUp()
+		return err
+	}
 
 	// Update the dentry's device and inode numbers (except for directories,
 	// for which these remain overlay-assigned).
@@ -241,14 +283,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 			Mask: linux.STATX_INO,
 		})
 		if err != nil {
-			d.upperVD.DecRef(ctx)
-			d.upperVD = vfs.VirtualDentry{}
 			cleanupUndoCopyUp()
 			return err
 		}
 		if upperStat.Mask&linux.STATX_INO == 0 {
-			d.upperVD.DecRef(ctx)
-			d.upperVD = vfs.VirtualDentry{}
 			cleanupUndoCopyUp()
 			return syserror.EREMOTE
 		}
@@ -257,6 +295,135 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		atomic.StoreUint64(&d.ino, upperStat.Ino)
 	}
 
+	if mmapOpts != nil && mmapOpts.Mappable != nil {
+		// Note that if mmapOpts != nil, then d.mapsMu is locked for writing
+		// (from the S_IFREG path above).
+
+		// Propagate mappings of d to the new Mappable. Remember which mappings
+		// we added so we can remove them on failure.
+		upperMappable := mmapOpts.Mappable
+		allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange)
+		for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+			added := make(memmap.MappingsOfRange)
+			for m := range seg.Value() {
+				if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil {
+					for m := range added {
+						upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
+					}
+					for mr, mappings := range allAdded {
+						for m := range mappings {
+							upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable)
+						}
+					}
+					return err
+				}
+				added[m] = struct{}{}
+			}
+			allAdded[seg.Range()] = added
+		}
+
+		// Switch to the new Mappable. We do this at the end of copy-up
+		// because:
+		//
+		// - We need to switch Mappables (by changing d.wrappedMappable) before
+		// invalidating Translations from the old Mappable (to pick up
+		// Translations from the new one).
+		//
+		// - We need to lock d.dataMu while changing d.wrappedMappable, but
+		// must invalidate Translations with d.dataMu unlocked (due to lock
+		// ordering).
+		//
+		// - Consequently, once we unlock d.dataMu, other threads may
+		// immediately observe the new (copied-up) Mappable, which we want to
+		// delay until copy-up is guaranteed to succeed.
+		d.dataMu.Lock()
+		lowerMappable := d.wrappedMappable
+		d.wrappedMappable = upperMappable
+		d.dataMu.Unlock()
+		d.lowerMappings.InvalidateAll(memmap.InvalidateOpts{})
+
+		// Remove mappings from the old Mappable.
+		for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+			for m := range seg.Value() {
+				lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
+			}
+		}
+		d.lowerMappings.RemoveAll()
+	}
+
 	atomic.StoreUint32(&d.copiedUp, 1)
 	return nil
 }
+
+// copyXattrsLocked copies a subset of lower's extended attributes to upper.
+// Attributes that configure an overlay in the lower are not copied up.
+//
+// Preconditions: d.copyMu must be locked for writing.
+func (d *dentry) copyXattrsLocked(ctx context.Context) error {
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	lowerPop := &vfs.PathOperation{Root: d.lowerVDs[0], Start: d.lowerVDs[0]}
+	upperPop := &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}
+
+	lowerXattrs, err := vfsObj.ListXattrAt(ctx, d.fs.creds, lowerPop, 0)
+	if err != nil {
+		if err == syserror.EOPNOTSUPP {
+			// There are no guarantees as to the contents of lowerXattrs.
+			return nil
+		}
+		ctx.Infof("failed to copy up xattrs because ListXattrAt failed: %v", err)
+		return err
+	}
+
+	for _, name := range lowerXattrs {
+		// Do not copy up overlay attributes.
+		if isOverlayXattr(name) {
+			continue
+		}
+
+		value, err := vfsObj.GetXattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetXattrOptions{Name: name, Size: 0})
+		if err != nil {
+			ctx.Infof("failed to copy up xattrs because GetXattrAt failed: %v", err)
+			return err
+		}
+
+		if err := vfsObj.SetXattrAt(ctx, d.fs.creds, upperPop, &vfs.SetXattrOptions{Name: name, Value: value}); err != nil {
+			ctx.Infof("failed to copy up xattrs because SetXattrAt failed: %v", err)
+			return err
+		}
+	}
+	return nil
+}
+
+// copyUpDescendantsLocked ensures that all descendants of d are copied up.
+//
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+func (d *dentry) copyUpDescendantsLocked(ctx context.Context, ds **[]*dentry) error {
+	dirents, err := d.getDirentsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	for _, dirent := range dirents {
+		if dirent.Name == "." || dirent.Name == ".." {
+			continue
+		}
+		child, err := d.fs.getChildLocked(ctx, d, dirent.Name, ds)
+		if err != nil {
+			return err
+		}
+		if err := child.copyUpLocked(ctx); err != nil {
+			return err
+		}
+		if child.isDir() {
+			child.dirMu.Lock()
+			err := child.copyUpDescendantsLocked(ctx, ds)
+			child.dirMu.Unlock()
+			if err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go
index 6a79f7ffe..df4492346 100644
--- a/pkg/sentry/fsimpl/overlay/directory.go
+++ b/pkg/sentry/fsimpl/overlay/directory.go
@@ -29,7 +29,9 @@ func (d *dentry) isDir() bool {
 	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR
 }
 
-// Preconditions: d.dirMu must be locked. d.isDir().
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
 func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) {
 	vfsObj := d.fs.vfsfs.VirtualFilesystem()
 	var readdirErr error
@@ -98,12 +100,13 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string
 	return whiteouts, readdirErr
 }
 
+// +stateify savable
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
 
-	mu      sync.Mutex
+	mu      sync.Mutex `state:"nosave"`
 	off     int64
 	dirents []vfs.Dirent
 }
@@ -114,10 +117,12 @@ func (fd *directoryFD) Release(ctx context.Context) {
 
 // IterDirents implements vfs.FileDescriptionImpl.IterDirents.
 func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	d := fd.dentry()
+	defer d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent)
+
 	fd.mu.Lock()
 	defer fd.mu.Unlock()
 
-	d := fd.dentry()
 	if fd.dirents == nil {
 		ds, err := d.getDirents(ctx)
 		if err != nil {
@@ -141,7 +146,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 	defer d.fs.renameMu.RUnlock()
 	d.dirMu.Lock()
 	defer d.dirMu.Unlock()
+	return d.getDirentsLocked(ctx)
+}
 
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+func (d *dentry) getDirentsLocked(ctx context.Context) ([]vfs.Dirent, error) {
 	if d.dirents != nil {
 		return d.dirents, nil
 	}
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index 86d0164b4..10161a08d 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -15,6 +15,8 @@
 package overlay
 
 import (
+	"fmt"
+	"strings"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -27,10 +29,15 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs
+// attributes.
+// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_PREFIX
+const _OVL_XATTR_PREFIX = linux.XATTR_TRUSTED_PREFIX + "overlay."
+
 // _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for
 // opaque directories.
 // Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE
-const _OVL_XATTR_OPAQUE = "trusted.overlay.opaque"
+const _OVL_XATTR_OPAQUE = _OVL_XATTR_PREFIX + "opaque"
 
 func isWhiteout(stat *linux.Statx) bool {
 	return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0
@@ -110,8 +117,10 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de
 // Dentries which may have a reference count of zero, and which therefore
 // should be dropped once traversal is complete, are appended to ds.
 //
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done().
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !rp.Done().
 func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
@@ -159,7 +168,9 @@ afterSymlink:
 	return child, nil
 }
 
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
 func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
 	if child, ok := parent.children[name]; ok {
 		return child, nil
@@ -177,7 +188,9 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s
 	return child, nil
 }
 
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
 func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
 	childPath := fspath.Parse(name)
 	child := fs.newDentry()
@@ -199,6 +212,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 			lookupErr = err
 			return false
 		}
+		defer childVD.DecRef(ctx)
 
 		mask := uint32(linux.STATX_TYPE)
 		if !existsOnAnyLayer {
@@ -237,6 +251,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 		}
 
 		// Update child to include this layer.
+		childVD.IncRef()
 		if isUpper {
 			child.upperVD = childVD
 			child.copiedUp = 1
@@ -261,10 +276,10 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 
 		// Directories are merged with directories from lower layers if they
 		// are not explicitly opaque.
-		opaqueVal, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{
+		opaqueVal, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
 			Root:  childVD,
 			Start: childVD,
-		}, &vfs.GetxattrOptions{
+		}, &vfs.GetXattrOptions{
 			Name: _OVL_XATTR_OPAQUE,
 			Size: 1,
 		})
@@ -287,8 +302,14 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 		child.devMinor = fs.dirDevMinor
 		child.ino = fs.newDirIno()
 	} else if !child.upperVD.Ok() {
+		childDevMinor, err := fs.getLowerDevMinor(child.devMajor, child.devMinor)
+		if err != nil {
+			ctx.Infof("overlay.filesystem.lookupLocked: failed to map lower layer device number (%d, %d) to an overlay-specific device number: %v", child.devMajor, child.devMinor, err)
+			child.destroyLocked(ctx)
+			return nil, err
+		}
 		child.devMajor = linux.UNNAMED_MAJOR
-		child.devMinor = fs.lowerDevMinors[child.lowerVDs[0].Mount().Filesystem()]
+		child.devMinor = childDevMinor
 	}
 
 	parent.IncRef()
@@ -300,7 +321,9 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 // lookupLayerLocked is similar to lookupLocked, but only returns information
 // about the file rather than a dentry.
 //
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
 func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) {
 	childPath := fspath.Parse(name)
 	lookupLayer := lookupLayerNone
@@ -385,7 +408,9 @@ func (ll lookupLayer) existsInOverlay() bool {
 // rp.Start().Impl().(*dentry)). It does not check that the returned directory
 // is searchable by the provider of rp.
 //
-// Preconditions: fs.renameMu must be locked. !rp.Done().
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
 	for !rp.Final() {
 		d.dirMu.Lock()
@@ -425,8 +450,9 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 // doCreateAt checks that creating a file at rp is permitted, then invokes
 // create to do so.
 //
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
 func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
@@ -479,7 +505,13 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil {
 		return err
 	}
+
 	parent.dirents = nil
+	ev := linux.IN_CREATE
+	if dir {
+		ev |= linux.IN_ISDIR
+	}
+	parent.watches.Notify(ctx, name, uint32(ev), 0 /* cookie */, vfs.InodeEvent, false /* unlinked */)
 	return nil
 }
 
@@ -493,7 +525,7 @@ func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFil
 
 func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) {
 	if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil {
-		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)
+		panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err))
 	}
 }
 
@@ -605,12 +637,13 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 			},
 		}); err != nil {
 			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil {
-				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)
+				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr))
 			} else if haveUpperWhiteout {
 				fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
 			}
 			return err
 		}
+		old.watches.Notify(ctx, "", linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent, false /* unlinked */)
 		return nil
 	})
 }
@@ -644,7 +677,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 			},
 		}); err != nil {
 			if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
-				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)
+				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr))
 			} else if haveUpperWhiteout {
 				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
 			}
@@ -654,12 +687,12 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 			// There may be directories on lower layers (previously hidden by
 			// the whiteout) that the new directory should not be merged with.
 			// Mark it opaque to prevent merging.
-			if err := vfsObj.SetxattrAt(ctx, fs.creds, &pop, &vfs.SetxattrOptions{
+			if err := vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{
 				Name:  _OVL_XATTR_OPAQUE,
 				Value: "y",
 			}); err != nil {
 				if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
-					ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)
+					panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr))
 				} else {
 					fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
 				}
@@ -703,7 +736,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 			},
 		}); err != nil {
 			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
-				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)
+				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr))
 			} else if haveUpperWhiteout {
 				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
 			}
@@ -732,10 +765,13 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 
 	start := rp.Start().Impl().(*dentry)
 	if rp.Done() {
+		if mayCreate && rp.MustBeDir() {
+			return nil, syserror.EISDIR
+		}
 		if mustCreate {
 			return nil, syserror.EEXIST
 		}
-		if mayWrite {
+		if start.isRegularFile() && mayWrite {
 			if err := start.copyUpLocked(ctx); err != nil {
 				return nil, err
 			}
@@ -755,6 +791,10 @@ afterTrailingSymlink:
 	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
+	// Reject attempts to open directories with O_CREAT.
+	if mayCreate && rp.MustBeDir() {
+		return nil, syserror.EISDIR
+	}
 	// Determine whether or not we need to create a file.
 	parent.dirMu.Lock()
 	child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
@@ -763,12 +803,11 @@ afterTrailingSymlink:
 		parent.dirMu.Unlock()
 		return fd, err
 	}
+	parent.dirMu.Unlock()
 	if err != nil {
-		parent.dirMu.Unlock()
 		return nil, err
 	}
 	// Open existing child or follow symlink.
-	parent.dirMu.Unlock()
 	if mustCreate {
 		return nil, syserror.EEXIST
 	}
@@ -783,7 +822,10 @@ afterTrailingSymlink:
 		start = parent
 		goto afterTrailingSymlink
 	}
-	if mayWrite {
+	if rp.MustBeDir() && !child.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if child.isRegularFile() && mayWrite {
 		if err := child.copyUpLocked(ctx); err != nil {
 			return nil, err
 		}
@@ -836,8 +878,11 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *
 	if err != nil {
 		return nil, err
 	}
+	if ftype != linux.S_IFREG {
+		return layerFD, nil
+	}
 	layerFlags := layerFD.StatusFlags()
-	fd := &nonDirectoryFD{
+	fd := &regularFileFD{
 		copiedUp:    isUpper,
 		cachedFD:    layerFD,
 		cachedFlags: layerFlags,
@@ -851,8 +896,9 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *
 	return &fd.vfsfd, nil
 }
 
-// Preconditions: parent.dirMu must be locked. parent does not already contain
-// a child named rp.Component().
+// Preconditions:
+// * parent.dirMu must be locked.
+// * parent does not already contain a child named rp.Component().
 func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
 	creds := rp.Credentials()
 	if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil {
@@ -913,7 +959,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
 		},
 	}); err != nil {
 		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
-			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)
+			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr))
 		} else if haveUpperWhiteout {
 			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
 		}
@@ -924,7 +970,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
 	child, err := fs.getChildLocked(ctx, parent, childName, ds)
 	if err != nil {
 		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
-			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)
+			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr))
 		} else if haveUpperWhiteout {
 			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
 		}
@@ -932,7 +978,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
 	}
 	// Finally construct the overlay FD.
 	upperFlags := upperFD.StatusFlags()
-	fd := &nonDirectoryFD{
+	fd := &regularFileFD{
 		copiedUp:    true,
 		cachedFD:    upperFD,
 		cachedFlags: upperFlags,
@@ -945,6 +991,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
 		// just can't open it anymore for some reason.
 		return nil, err
 	}
+	parent.watches.Notify(ctx, childName, linux.IN_CREATE, 0 /* cookie */, vfs.PathEvent, false /* unlinked */)
 	return &fd.vfsfd, nil
 }
 
@@ -990,9 +1037,224 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	}
 	defer mnt.EndWrite()
 
-	// FIXME(gvisor.dev/issue/1199): Actually implement rename.
-	_ = newParent
-	return syserror.EXDEV
+	oldParent := oldParentVD.Dentry().Impl().(*dentry)
+	creds := rp.Credentials()
+	if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	// We need a dentry representing the renamed file since, if it's a
+	// directory, we need to check for write permission on it.
+	oldParent.dirMu.Lock()
+	defer oldParent.dirMu.Unlock()
+	renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds)
+	if err != nil {
+		return err
+	}
+	if err := vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&oldParent.mode)), auth.KUID(atomic.LoadUint32(&renamed.uid))); err != nil {
+		return err
+	}
+	if renamed.isDir() {
+		if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
+			return syserror.EINVAL
+		}
+		if oldParent != newParent {
+			if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
+				return err
+			}
+		}
+	} else {
+		if opts.MustBeDir || rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+
+	if oldParent != newParent {
+		if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+			return err
+		}
+		newParent.dirMu.Lock()
+		defer newParent.dirMu.Unlock()
+	}
+	if newParent.vfsd.IsDead() {
+		return syserror.ENOENT
+	}
+	replacedLayer, err := fs.lookupLayerLocked(ctx, newParent, newName)
+	if err != nil {
+		return err
+	}
+	var (
+		replaced     *dentry
+		replacedVFSD *vfs.Dentry
+		whiteouts    map[string]bool
+	)
+	if replacedLayer.existsInOverlay() {
+		replaced, err = fs.getChildLocked(ctx, newParent, newName, &ds)
+		if err != nil {
+			return err
+		}
+		replacedVFSD = &replaced.vfsd
+		if replaced.isDir() {
+			if !renamed.isDir() {
+				return syserror.EISDIR
+			}
+			if genericIsAncestorDentry(replaced, renamed) {
+				return syserror.ENOTEMPTY
+			}
+			replaced.dirMu.Lock()
+			defer replaced.dirMu.Unlock()
+			whiteouts, err = replaced.collectWhiteoutsForRmdirLocked(ctx)
+			if err != nil {
+				return err
+			}
+		} else {
+			if rp.MustBeDir() || renamed.isDir() {
+				return syserror.ENOTDIR
+			}
+		}
+	}
+
+	if oldParent == newParent && oldName == newName {
+		return nil
+	}
+
+	// renamed and oldParent need to be copied-up before they're renamed on the
+	// upper layer.
+	if err := renamed.copyUpLocked(ctx); err != nil {
+		return err
+	}
+	// If renamed is a directory, all of its descendants need to be copied-up
+	// before they're renamed on the upper layer.
+	if renamed.isDir() {
+		if err := renamed.copyUpDescendantsLocked(ctx, &ds); err != nil {
+			return err
+		}
+	}
+	// newParent must be copied-up before it can contain renamed on the upper
+	// layer.
+	if err := newParent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+	// If replaced exists, it doesn't need to be copied-up, but we do need to
+	// serialize with copy-up. Holding renameMu for writing should be
+	// sufficient, but out of an abundance of caution...
+	if replaced != nil {
+		replaced.copyMu.RLock()
+		defer replaced.copyMu.RUnlock()
+	}
+
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef(ctx)
+	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
+		return err
+	}
+
+	newpop := vfs.PathOperation{
+		Root:  newParent.upperVD,
+		Start: newParent.upperVD,
+		Path:  fspath.Parse(newName),
+	}
+
+	needRecreateWhiteouts := false
+	cleanupRecreateWhiteouts := func() {
+		if !needRecreateWhiteouts {
+			return
+		}
+		for whiteoutName, whiteoutUpper := range whiteouts {
+			if !whiteoutUpper {
+				continue
+			}
+			if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{
+				Root:  replaced.upperVD,
+				Start: replaced.upperVD,
+				Path:  fspath.Parse(whiteoutName),
+			}); err != nil && err != syserror.EEXIST {
+				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err))
+			}
+		}
+	}
+	if renamed.isDir() {
+		if replacedLayer == lookupLayerUpper {
+			// Remove whiteouts from the directory being replaced.
+			needRecreateWhiteouts = true
+			for whiteoutName, whiteoutUpper := range whiteouts {
+				if !whiteoutUpper {
+					continue
+				}
+				if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
+					Root:  replaced.upperVD,
+					Start: replaced.upperVD,
+					Path:  fspath.Parse(whiteoutName),
+				}); err != nil {
+					cleanupRecreateWhiteouts()
+					vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+					return err
+				}
+			}
+		} else if replacedLayer == lookupLayerUpperWhiteout {
+			// We need to explicitly remove the whiteout since otherwise rename
+			// on the upper layer will fail with ENOTDIR.
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
+				vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+				return err
+			}
+		}
+	}
+
+	// Essentially no gVisor filesystem supports RENAME_WHITEOUT, so just do a
+	// regular rename and create the whiteout at the origin manually. Unlike
+	// RENAME_WHITEOUT, this isn't atomic with respect to other users of the
+	// upper filesystem, but this is already the case for virtually all other
+	// overlay filesystem operations too.
+	oldpop := vfs.PathOperation{
+		Root:  oldParent.upperVD,
+		Start: oldParent.upperVD,
+		Path:  fspath.Parse(oldName),
+	}
+	if err := vfsObj.RenameAt(ctx, creds, &oldpop, &newpop, &opts); err != nil {
+		cleanupRecreateWhiteouts()
+		vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+		return err
+	}
+
+	// Below this point, the renamed dentry is now at newpop, and anything we
+	// replaced is gone forever. Commit the rename, update the overlay
+	// filesystem tree, and abandon attempts to recover from errors.
+	vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
+	delete(oldParent.children, oldName)
+	if replaced != nil {
+		ds = appendDentry(ds, replaced)
+	}
+	if oldParent != newParent {
+		newParent.dirents = nil
+		// This can't drop the last reference on oldParent because one is held
+		// by oldParentVD, so lock recursion is impossible.
+		oldParent.DecRef(ctx)
+		ds = appendDentry(ds, oldParent)
+		newParent.IncRef()
+		renamed.parent = newParent
+	}
+	renamed.name = newName
+	if newParent.children == nil {
+		newParent.children = make(map[string]*dentry)
+	}
+	newParent.children[newName] = renamed
+	oldParent.dirents = nil
+
+	if err := fs.createWhiteout(ctx, vfsObj, &oldpop); err != nil {
+		panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout at origin after RenameAt: %v", err))
+	}
+	if renamed.isDir() {
+		if err := vfsObj.SetXattrAt(ctx, fs.creds, &newpop, &vfs.SetXattrOptions{
+			Name:  _OVL_XATTR_OPAQUE,
+			Value: "y",
+		}); err != nil {
+			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to make renamed directory opaque: %v", err))
+		}
+	}
+
+	vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
+	return nil
 }
 
 // RmdirAt implements vfs.FilesystemImpl.RmdirAt.
@@ -1040,6 +1302,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if !child.isDir() {
 		return syserror.ENOTDIR
 	}
+	if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(atomic.LoadUint32(&parent.mode)), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil {
+		return err
+	}
 	child.dirMu.Lock()
 	defer child.dirMu.Unlock()
 	whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx)
@@ -1071,7 +1336,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 					Start: child.upperVD,
 					Path:  fspath.Parse(whiteoutName),
 				}); err != nil && err != syserror.EEXIST {
-					ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)
+					panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err))
 				}
 			}
 		}
@@ -1101,15 +1366,14 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		// Don't attempt to recover from this: the original directory is
 		// already gone, so any dentries representing it are invalid, and
 		// creating a new directory won't undo that.
-		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)
-		vfsObj.AbortDeleteDentry(&child.vfsd)
-		return err
+		panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err))
 	}
 
 	vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
 	delete(parent.children, name)
 	ds = appendDentry(ds, child)
 	parent.dirents = nil
+	parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0 /* cookie */, vfs.InodeEvent, true /* unlinked */)
 	return nil
 }
 
@@ -1117,12 +1381,25 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
 	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
+		fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
 		return err
 	}
+	err = d.setStatLocked(ctx, rp, opts)
+	fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	if err != nil {
+		return err
+	}
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ctx, ev, 0 /* cookie */, vfs.InodeEvent)
+	}
+	return nil
+}
 
+// Precondition: d.fs.renameMu must be held for reading.
+func (d *dentry) setStatLocked(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
 	if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
 		return err
@@ -1217,7 +1494,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 			},
 		}); err != nil {
 			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
-				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)
+				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr))
 			} else if haveUpperWhiteout {
 				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
 			}
@@ -1263,12 +1540,38 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		return err
 	}
 
+	parentMode := atomic.LoadUint32(&parent.mode)
 	child := parent.children[name]
 	var childLayer lookupLayer
+	if child == nil {
+		if parentMode&linux.S_ISVTX != 0 {
+			// If the parent's sticky bit is set, we need a child dentry to get
+			// its owner.
+			child, err = fs.getChildLocked(ctx, parent, name, &ds)
+			if err != nil {
+				return err
+			}
+		} else {
+			// Determine if the file being unlinked actually exists. Holding
+			// parent.dirMu prevents a dentry from being instantiated for the file,
+			// which in turn prevents it from being copied-up, so this result is
+			// stable.
+			childLayer, err = fs.lookupLayerLocked(ctx, parent, name)
+			if err != nil {
+				return err
+			}
+			if !childLayer.existsInOverlay() {
+				return syserror.ENOENT
+			}
+		}
+	}
 	if child != nil {
 		if child.isDir() {
 			return syserror.EISDIR
 		}
+		if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(parentMode), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil {
+			return err
+		}
 		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
 			return err
 		}
@@ -1281,18 +1584,6 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		} else {
 			childLayer = lookupLayerLower
 		}
-	} else {
-		// Determine if the file being unlinked actually exists. Holding
-		// parent.dirMu prevents a dentry from being instantiated for the file,
-		// which in turn prevents it from being copied-up, so this result is
-		// stable.
-		childLayer, err = fs.lookupLayerLocked(ctx, parent, name)
-		if err != nil {
-			return err
-		}
-		if !childLayer.existsInOverlay() {
-			return syserror.ENOENT
-		}
 	}
 
 	pop := vfs.PathOperation{
@@ -1310,70 +1601,175 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		}
 	}
 	if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
-		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)
-		if child != nil {
-			vfsObj.AbortDeleteDentry(&child.vfsd)
-		}
-		return err
+		panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err))
 	}
 
+	var cw *vfs.Watches
 	if child != nil {
 		vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
 		delete(parent.children, name)
 		ds = appendDentry(ds, child)
+		cw = &child.watches
 	}
+	vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name)
 	parent.dirents = nil
 	return nil
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// isOverlayXattr returns whether the given extended attribute configures the
+// overlay.
+func isOverlayXattr(name string) bool {
+	return strings.HasPrefix(name, _OVL_XATTR_PREFIX)
+}
+
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
-	_, err := fs.resolveLocked(ctx, rp, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
 		return nil, err
 	}
-	// TODO(gvisor.dev/issue/1199): Linux overlayfs actually allows listxattr,
-	// but not any other xattr syscalls. For now we just reject all of them.
-	return nil, syserror.ENOTSUP
+
+	return fs.listXattr(ctx, d, size)
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]string, error) {
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	top := d.topLayer()
+	names, err := vfsObj.ListXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size)
+	if err != nil {
+		return nil, err
+	}
+
+	// Filter out all overlay attributes.
+	n := 0
+	for _, name := range names {
+		if !isOverlayXattr(name) {
+			names[n] = name
+			n++
+		}
+	}
+	return names[:n], err
+}
+
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
-	_, err := fs.resolveLocked(ctx, rp, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
 		return "", err
 	}
-	return "", syserror.ENOTSUP
+
+	return fs.getXattr(ctx, d, rp.Credentials(), &opts)
+}
+
+func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
+	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
+		return "", err
+	}
+
+	// Return EOPNOTSUPP when fetching an overlay attribute.
+	// See fs/overlayfs/super.c:ovl_own_xattr_get().
+	if isOverlayXattr(opts.Name) {
+		return "", syserror.EOPNOTSUPP
+	}
+
+	// Analogous to fs/overlayfs/super.c:ovl_other_xattr_get().
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	top := d.topLayer()
+	return vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts)
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
-	_, err := fs.resolveLocked(ctx, rp, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
+		fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+		return err
+	}
+
+	err = fs.setXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), &opts)
+	fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	if err != nil {
+		return err
+	}
+
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent)
+	return nil
+}
+
+// Precondition: fs.renameMu must be locked.
+func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
+	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
+		return err
+	}
+
+	// Return EOPNOTSUPP when setting an overlay attribute.
+	// See fs/overlayfs/super.c:ovl_own_xattr_set().
+	if isOverlayXattr(opts.Name) {
+		return syserror.EOPNOTSUPP
+	}
+
+	// Analogous to fs/overlayfs/super.c:ovl_other_xattr_set().
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := d.copyUpLocked(ctx); err != nil {
 		return err
 	}
-	return syserror.ENOTSUP
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	return vfsObj.SetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts)
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
-	_, err := fs.resolveLocked(ctx, rp, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
+		fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+		return err
+	}
+
+	err = fs.removeXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), name)
+	fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	if err != nil {
+		return err
+	}
+
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent)
+	return nil
+}
+
+// Precondition: fs.renameMu must be locked.
+func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, name string) error {
+	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
+		return err
+	}
+
+	// Like SetXattrAt, return EOPNOTSUPP when removing an overlay attribute.
+	// Linux passes the remove request to xattr_handler->set.
+	// See fs/xattr.c:vfs_removexattr().
+	if isOverlayXattr(name) {
+		return syserror.EOPNOTSUPP
+	}
+
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := d.copyUpLocked(ctx); err != nil {
 		return err
 	}
-	return syserror.ENOTSUP
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	return vfsObj.RemoveXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name)
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go
deleted file mode 100644
index d3060a481..000000000
--- a/pkg/sentry/fsimpl/overlay/non_directory.go
+++ /dev/null
@@ -1,266 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package overlay
-
-import (
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-func (d *dentry) isSymlink() bool {
-	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
-}
-
-func (d *dentry) readlink(ctx context.Context) (string, error) {
-	layerVD := d.topLayer()
-	return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
-		Root:  layerVD,
-		Start: layerVD,
-	})
-}
-
-type nonDirectoryFD struct {
-	fileDescription
-
-	// If copiedUp is false, cachedFD represents
-	// fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents
-	// fileDescription.dentry().upperVD. cachedFlags is the last known value of
-	// cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are
-	// protected by mu.
-	mu          sync.Mutex
-	copiedUp    bool
-	cachedFD    *vfs.FileDescription
-	cachedFlags uint32
-}
-
-func (fd *nonDirectoryFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) {
-	fd.mu.Lock()
-	defer fd.mu.Unlock()
-	wrappedFD, err := fd.currentFDLocked(ctx)
-	if err != nil {
-		return nil, err
-	}
-	wrappedFD.IncRef()
-	return wrappedFD, nil
-}
-
-func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) {
-	d := fd.dentry()
-	statusFlags := fd.vfsfd.StatusFlags()
-	if !fd.copiedUp && d.isCopiedUp() {
-		// Switch to the copied-up file.
-		upperVD := d.topLayer()
-		upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
-			Root:  upperVD,
-			Start: upperVD,
-		}, &vfs.OpenOptions{
-			Flags: statusFlags,
-		})
-		if err != nil {
-			return nil, err
-		}
-		oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR)
-		if oldOffErr == nil {
-			if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil {
-				upperFD.DecRef(ctx)
-				return nil, err
-			}
-		}
-		fd.cachedFD.DecRef(ctx)
-		fd.copiedUp = true
-		fd.cachedFD = upperFD
-		fd.cachedFlags = statusFlags
-	} else if fd.cachedFlags != statusFlags {
-		if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil {
-			return nil, err
-		}
-		fd.cachedFlags = statusFlags
-	}
-	return fd.cachedFD, nil
-}
-
-// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *nonDirectoryFD) Release(ctx context.Context) {
-	fd.cachedFD.DecRef(ctx)
-	fd.cachedFD = nil
-}
-
-// OnClose implements vfs.FileDescriptionImpl.OnClose.
-func (fd *nonDirectoryFD) OnClose(ctx context.Context) error {
-	// Linux doesn't define ovl_file_operations.flush at all (i.e. its
-	// equivalent to OnClose is a no-op). We pass through to
-	// fd.cachedFD.OnClose() without upgrading if fd.dentry() has been
-	// copied-up, since OnClose is mostly used to define post-close writeback,
-	// and if fd.cachedFD hasn't been updated then it can't have been used to
-	// mutate fd.dentry() anyway.
-	fd.mu.Lock()
-	if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags {
-		if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil {
-			fd.mu.Unlock()
-			return err
-		}
-		fd.cachedFlags = statusFlags
-	}
-	wrappedFD := fd.cachedFD
-	defer wrappedFD.IncRef()
-	fd.mu.Unlock()
-	return wrappedFD.OnClose(ctx)
-}
-
-// Stat implements vfs.FileDescriptionImpl.Stat.
-func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
-	var stat linux.Statx
-	if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
-		wrappedFD, err := fd.getCurrentFD(ctx)
-		if err != nil {
-			return linux.Statx{}, err
-		}
-		stat, err = wrappedFD.Stat(ctx, vfs.StatOptions{
-			Mask: layerMask,
-			Sync: opts.Sync,
-		})
-		wrappedFD.DecRef(ctx)
-		if err != nil {
-			return linux.Statx{}, err
-		}
-	}
-	fd.dentry().statInternalTo(ctx, &opts, &stat)
-	return stat, nil
-}
-
-// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	d := fd.dentry()
-	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
-	if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
-		return err
-	}
-	mnt := fd.vfsfd.Mount()
-	if err := mnt.CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer mnt.EndWrite()
-	if err := d.copyUpLocked(ctx); err != nil {
-		return err
-	}
-	// Changes to d's attributes are serialized by d.copyMu.
-	d.copyMu.Lock()
-	defer d.copyMu.Unlock()
-	wrappedFD, err := fd.currentFDLocked(ctx)
-	if err != nil {
-		return err
-	}
-	if err := wrappedFD.SetStat(ctx, opts); err != nil {
-		return err
-	}
-	d.updateAfterSetStatLocked(&opts)
-	return nil
-}
-
-// StatFS implements vfs.FileDescriptionImpl.StatFS.
-func (fd *nonDirectoryFD) StatFS(ctx context.Context) (linux.Statfs, error) {
-	return fd.filesystem().statFS(ctx)
-}
-
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
-	wrappedFD, err := fd.getCurrentFD(ctx)
-	if err != nil {
-		return 0, err
-	}
-	defer wrappedFD.DecRef(ctx)
-	return wrappedFD.PRead(ctx, dst, offset, opts)
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (fd *nonDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	// Hold fd.mu during the read to serialize the file offset.
-	fd.mu.Lock()
-	defer fd.mu.Unlock()
-	wrappedFD, err := fd.currentFDLocked(ctx)
-	if err != nil {
-		return 0, err
-	}
-	return wrappedFD.Read(ctx, dst, opts)
-}
-
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	wrappedFD, err := fd.getCurrentFD(ctx)
-	if err != nil {
-		return 0, err
-	}
-	defer wrappedFD.DecRef(ctx)
-	return wrappedFD.PWrite(ctx, src, offset, opts)
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (fd *nonDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
-	// Hold fd.mu during the write to serialize the file offset.
-	fd.mu.Lock()
-	defer fd.mu.Unlock()
-	wrappedFD, err := fd.currentFDLocked(ctx)
-	if err != nil {
-		return 0, err
-	}
-	return wrappedFD.Write(ctx, src, opts)
-}
-
-// Seek implements vfs.FileDescriptionImpl.Seek.
-func (fd *nonDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
-	// Hold fd.mu during the seek to serialize the file offset.
-	fd.mu.Lock()
-	defer fd.mu.Unlock()
-	wrappedFD, err := fd.currentFDLocked(ctx)
-	if err != nil {
-		return 0, err
-	}
-	return wrappedFD.Seek(ctx, offset, whence)
-}
-
-// Sync implements vfs.FileDescriptionImpl.Sync.
-func (fd *nonDirectoryFD) Sync(ctx context.Context) error {
-	fd.mu.Lock()
-	if !fd.dentry().isCopiedUp() {
-		fd.mu.Unlock()
-		return nil
-	}
-	wrappedFD, err := fd.currentFDLocked(ctx)
-	if err != nil {
-		fd.mu.Unlock()
-		return err
-	}
-	wrappedFD.IncRef()
-	defer wrappedFD.DecRef(ctx)
-	fd.mu.Unlock()
-	return wrappedFD.Sync(ctx)
-}
-
-// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
-func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
-	wrappedFD, err := fd.getCurrentFD(ctx)
-	if err != nil {
-		return err
-	}
-	defer wrappedFD.DecRef(ctx)
-	return wrappedFD.ConfigureMMap(ctx, opts)
-}
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index 75cc006bf..c812f0a70 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -18,10 +18,15 @@
 //
 // Lock order:
 //
-// directoryFD.mu / nonDirectoryFD.mu
+// directoryFD.mu / regularFileFD.mu
 //   filesystem.renameMu
 //     dentry.dirMu
 //       dentry.copyMu
+//         filesystem.devMu
+//         *** "memmap.Mappable locks" below this point
+//         dentry.mapsMu
+//           *** "memmap.Mappable locks taken by Translate" below this point
+//           dentry.dataMu
 //
 // Locking dentry.dirMu in multiple dentries requires that parent dentries are
 // locked before child dentries, and that filesystem.renameMu is locked to
@@ -29,14 +34,17 @@
 package overlay
 
 import (
+	"fmt"
 	"strings"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -46,6 +54,8 @@ import (
 const Name = "overlay"
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
 // Name implements vfs.FilesystemType.Name.
@@ -53,8 +63,13 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
 // FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
 // FilesystemType.GetFilesystem.
+//
+// +stateify savable
 type FilesystemOptions struct {
 	// Callers passing FilesystemOptions to
 	// overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
@@ -71,6 +86,8 @@ type FilesystemOptions struct {
 }
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	vfsfs vfs.Filesystem
 
@@ -85,97 +102,93 @@ type filesystem struct {
 	// is immutable.
 	dirDevMinor uint32
 
-	// lowerDevMinors maps lower layer filesystems to device minor numbers
-	// assigned to non-directory files originating from that filesystem.
-	// lowerDevMinors is immutable.
-	lowerDevMinors map[*vfs.Filesystem]uint32
+	// lowerDevMinors maps device numbers from lower layer filesystems to
+	// device minor numbers assigned to non-directory files originating from
+	// that filesystem. (This remapping is necessary for lower layers because a
+	// file on a lower layer, and that same file on an overlay, are
+	// distinguishable because they will diverge after copy-up; this isn't true
+	// for non-directory files already on the upper layer.) lowerDevMinors is
+	// protected by devMu.
+	devMu          sync.Mutex `state:"nosave"`
+	lowerDevMinors map[layerDevNumber]uint32
 
 	// renameMu synchronizes renaming with non-renaming operations in order to
 	// ensure consistent lock ordering between dentry.dirMu in different
 	// dentries.
-	renameMu sync.RWMutex
+	renameMu sync.RWMutex `state:"nosave"`
 
 	// lastDirIno is the last inode number assigned to a directory. lastDirIno
 	// is accessed using atomic memory operations.
 	lastDirIno uint64
 }
 
+// +stateify savable
+type layerDevNumber struct {
+	major uint32
+	minor uint32
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	mopts := vfs.GenericParseMountOptions(opts.Data)
 	fsoptsRaw := opts.InternalData
-	fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions)
-	if fsoptsRaw != nil && !haveFSOpts {
-		ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
+	fsopts, ok := fsoptsRaw.(FilesystemOptions)
+	if fsoptsRaw != nil && !ok {
+		ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
 		return nil, nil, syserror.EINVAL
 	}
-	if haveFSOpts {
-		if len(fsopts.LowerRoots) == 0 {
-			ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty")
+	vfsroot := vfs.RootFromContext(ctx)
+	if vfsroot.Ok() {
+		defer vfsroot.DecRef(ctx)
+	}
+
+	if upperPathname, ok := mopts["upperdir"]; ok {
+		if fsopts.UpperRoot.Ok() {
+			ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified")
 			return nil, nil, syserror.EINVAL
 		}
-		if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
-			ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified")
+		delete(mopts, "upperdir")
+		// Linux overlayfs also requires a workdir when upperdir is
+		// specified; we don't, so silently ignore this option.
+		delete(mopts, "workdir")
+		upperPath := fspath.Parse(upperPathname)
+		if !upperPath.Absolute {
+			ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
 			return nil, nil, syserror.EINVAL
 		}
-		// We don't enforce a maximum number of lower layers when not
-		// configured by applications; the sandbox owner can have an overlay
-		// filesystem with any number of lower layers.
-	} else {
-		vfsroot := vfs.RootFromContext(ctx)
-		defer vfsroot.DecRef(ctx)
-		upperPathname, ok := mopts["upperdir"]
-		if ok {
-			delete(mopts, "upperdir")
-			// Linux overlayfs also requires a workdir when upperdir is
-			// specified; we don't, so silently ignore this option.
-			delete(mopts, "workdir")
-			upperPath := fspath.Parse(upperPathname)
-			if !upperPath.Absolute {
-				ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
-				return nil, nil, syserror.EINVAL
-			}
-			upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
-				Root:               vfsroot,
-				Start:              vfsroot,
-				Path:               upperPath,
-				FollowFinalSymlink: true,
-			}, &vfs.GetDentryOptions{
-				CheckSearchable: true,
-			})
-			if err != nil {
-				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
-				return nil, nil, err
-			}
-			defer upperRoot.DecRef(ctx)
-			privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
-			if err != nil {
-				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
-				return nil, nil, err
-			}
-			defer privateUpperRoot.DecRef(ctx)
-			fsopts.UpperRoot = privateUpperRoot
+		upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
+			Root:               vfsroot,
+			Start:              vfsroot,
+			Path:               upperPath,
+			FollowFinalSymlink: true,
+		}, &vfs.GetDentryOptions{
+			CheckSearchable: true,
+		})
+		if err != nil {
+			ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
+			return nil, nil, err
+		}
+		privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
+		upperRoot.DecRef(ctx)
+		if err != nil {
+			ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
+			return nil, nil, err
 		}
-		lowerPathnamesStr, ok := mopts["lowerdir"]
-		if !ok {
-			ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir")
+		defer privateUpperRoot.DecRef(ctx)
+		fsopts.UpperRoot = privateUpperRoot
+	}
+
+	if lowerPathnamesStr, ok := mopts["lowerdir"]; ok {
+		if len(fsopts.LowerRoots) != 0 {
+			ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified")
 			return nil, nil, syserror.EINVAL
 		}
 		delete(mopts, "lowerdir")
 		lowerPathnames := strings.Split(lowerPathnamesStr, ":")
-		const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK
-		if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() {
-			ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified")
-			return nil, nil, syserror.EINVAL
-		}
-		if len(lowerPathnames) > maxLowerLayers {
-			ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers)
-			return nil, nil, syserror.EINVAL
-		}
 		for _, lowerPathname := range lowerPathnames {
 			lowerPath := fspath.Parse(lowerPathname)
 			if !lowerPath.Absolute {
-				ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
+				ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
 				return nil, nil, syserror.EINVAL
 			}
 			lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
@@ -187,44 +200,44 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 				CheckSearchable: true,
 			})
 			if err != nil {
-				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
+				ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
 				return nil, nil, err
 			}
-			defer lowerRoot.DecRef(ctx)
 			privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */)
+			lowerRoot.DecRef(ctx)
 			if err != nil {
-				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
+				ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
 				return nil, nil, err
 			}
 			defer privateLowerRoot.DecRef(ctx)
 			fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot)
 		}
 	}
+
 	if len(mopts) != 0 {
-		ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
+		ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	if len(fsopts.LowerRoots) == 0 {
+		ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required")
+		return nil, nil, syserror.EINVAL
+	}
+	if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
+		ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present")
+		return nil, nil, syserror.EINVAL
+	}
+	const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK
+	if len(fsopts.LowerRoots) > maxLowerLayers {
+		ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers)
 		return nil, nil, syserror.EINVAL
 	}
 
-	// Allocate device numbers.
+	// Allocate dirDevMinor. lowerDevMinors are allocated dynamically.
 	dirDevMinor, err := vfsObj.GetAnonBlockDevMinor()
 	if err != nil {
 		return nil, nil, err
 	}
-	lowerDevMinors := make(map[*vfs.Filesystem]uint32)
-	for _, lowerRoot := range fsopts.LowerRoots {
-		lowerFS := lowerRoot.Mount().Filesystem()
-		if _, ok := lowerDevMinors[lowerFS]; !ok {
-			devMinor, err := vfsObj.GetAnonBlockDevMinor()
-			if err != nil {
-				vfsObj.PutAnonBlockDevMinor(dirDevMinor)
-				for _, lowerDevMinor := range lowerDevMinors {
-					vfsObj.PutAnonBlockDevMinor(lowerDevMinor)
-				}
-				return nil, nil, err
-			}
-			lowerDevMinors[lowerFS] = devMinor
-		}
-	}
 
 	// Take extra references held by the filesystem.
 	if fsopts.UpperRoot.Ok() {
@@ -238,7 +251,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		opts:           fsopts,
 		creds:          creds.Fork(),
 		dirDevMinor:    dirDevMinor,
-		lowerDevMinors: lowerDevMinors,
+		lowerDevMinors: make(map[layerDevNumber]uint32),
 	}
 	fs.vfsfs.Init(vfsObj, &fstype, fs)
 
@@ -274,7 +287,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		return nil, nil, syserror.EREMOTE
 	}
 	if isWhiteout(&rootStat) {
-		ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
+		ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
 		root.destroyLocked(ctx)
 		fs.vfsfs.DecRef(ctx)
 		return nil, nil, syserror.EINVAL
@@ -288,7 +301,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		root.ino = fs.newDirIno()
 	} else if !root.upperVD.Ok() {
 		root.devMajor = linux.UNNAMED_MAJOR
-		root.devMinor = fs.lowerDevMinors[root.lowerVDs[0].Mount().Filesystem()]
+		rootDevMinor, err := fs.getLowerDevMinor(rootStat.DevMajor, rootStat.DevMinor)
+		if err != nil {
+			ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err)
+			root.destroyLocked(ctx)
+			fs.vfsfs.DecRef(ctx)
+			return nil, nil, err
+		}
+		root.devMinor = rootDevMinor
 		root.ino = rootStat.Ino
 	} else {
 		root.devMajor = rootStat.DevMajor
@@ -315,7 +335,11 @@ func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forc
 	if err != nil {
 		return vfs.VirtualDentry{}, err
 	}
-	return vfs.MakeVirtualDentry(newmnt, vd.Dentry()), nil
+	// Take a reference on the dentry which will be owned by the returned
+	// VirtualDentry.
+	d := vd.Dentry()
+	d.IncRef()
+	return vfs.MakeVirtualDentry(newmnt, d), nil
 }
 
 // Release implements vfs.FilesystemImpl.Release.
@@ -357,7 +381,24 @@ func (fs *filesystem) newDirIno() uint64 {
 	return atomic.AddUint64(&fs.lastDirIno, 1)
 }
 
+func (fs *filesystem) getLowerDevMinor(layerMajor, layerMinor uint32) (uint32, error) {
+	fs.devMu.Lock()
+	defer fs.devMu.Unlock()
+	orig := layerDevNumber{layerMajor, layerMinor}
+	if minor, ok := fs.lowerDevMinors[orig]; ok {
+		return minor, nil
+	}
+	minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor()
+	if err != nil {
+		return 0, err
+	}
+	fs.lowerDevMinors[orig] = minor
+	return minor, nil
+}
+
 // dentry implements vfs.DentryImpl.
+//
+// +stateify savable
 type dentry struct {
 	vfsd vfs.Dentry
 
@@ -390,7 +431,7 @@ type dentry struct {
 	// and dirents (if not nil) is a cache of dirents as returned by
 	// directoryFDs representing this directory. children is protected by
 	// dirMu.
-	dirMu    sync.Mutex
+	dirMu    sync.Mutex `state:"nosave"`
 	children map[string]*dentry
 	dirents  []vfs.Dirent
 
@@ -400,7 +441,7 @@ type dentry struct {
 	// If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e.
 	// be copied up) with copyMu locked for writing; otherwise, it is
 	// immutable. lowerVDs is always immutable.
-	copyMu   sync.RWMutex
+	copyMu   sync.RWMutex `state:"nosave"`
 	upperVD  vfs.VirtualDentry
 	lowerVDs []vfs.VirtualDentry
 
@@ -415,7 +456,43 @@ type dentry struct {
 	devMinor uint32
 	ino      uint64
 
+	// If this dentry represents a regular file, then:
+	//
+	// - mapsMu is used to synchronize between copy-up and memmap.Mappable
+	// methods on dentry preceding mm.MemoryManager.activeMu in the lock order.
+	//
+	// - dataMu is used to synchronize between copy-up and
+	// dentry.(memmap.Mappable).Translate.
+	//
+	// - lowerMappings tracks memory mappings of the file. lowerMappings is
+	// used to invalidate mappings of the lower layer when the file is copied
+	// up to ensure that they remain coherent with subsequent writes to the
+	// file. (Note that, as of this writing, Linux overlayfs does not do this;
+	// this feature is a gVisor extension.) lowerMappings is protected by
+	// mapsMu.
+	//
+	// - If this dentry is copied-up, then wrappedMappable is the Mappable
+	// obtained from a call to the current top layer's
+	// FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil
+	// (from a call to regularFileFD.ensureMappable()), it cannot become nil.
+	// wrappedMappable is protected by mapsMu and dataMu.
+	//
+	// - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is
+	// accessed using atomic memory operations.
+	mapsMu          sync.Mutex `state:"nosave"`
+	lowerMappings   memmap.MappingSet
+	dataMu          sync.RWMutex `state:"nosave"`
+	wrappedMappable memmap.Mappable
+	isMappable      uint32
+
 	locks vfs.FileLocks
+
+	// watches is the set of inotify watches on the file repesented by this dentry.
+	//
+	// Note that hard links to the same file will not share the same set of
+	// watches, due to the fact that we do not have inode structures in this
+	// overlay implementation.
+	watches vfs.Watches
 }
 
 // newDentry creates a new dentry. The dentry initially has no references; it
@@ -428,6 +505,9 @@ func (fs *filesystem) newDentry() *dentry {
 	}
 	d.lowerVDs = d.inlineLowerVDs[:0]
 	d.vfsd.Init(d)
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Register(d, "overlay.dentry")
+	}
 	return d
 }
 
@@ -475,6 +555,14 @@ func (d *dentry) checkDropLocked(ctx context.Context) {
 	if atomic.LoadInt64(&d.refs) != 0 {
 		return
 	}
+
+	// Make sure that we do not lose watches on dentries that have not been
+	// deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so
+	// d.vfsd.IsDead() indicates that d was deleted.
+	if !d.vfsd.IsDead() && d.watches.Size() > 0 {
+		return
+	}
+
 	// Refs is still zero; destroy it.
 	d.destroyLocked(ctx)
 	return
@@ -482,7 +570,9 @@ func (d *dentry) checkDropLocked(ctx context.Context) {
 
 // destroyLocked destroys the dentry.
 //
-// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+// Preconditions:
+// * d.fs.renameMu must be locked for writing.
+// * d.refs == 0.
 func (d *dentry) destroyLocked(ctx context.Context) {
 	switch atomic.LoadInt64(&d.refs) {
 	case 0:
@@ -501,6 +591,8 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 		lowerVD.DecRef(ctx)
 	}
 
+	d.watches.HandleDeletion(ctx)
+
 	if d.parent != nil {
 		d.parent.dirMu.Lock()
 		if !d.vfsd.IsDead() {
@@ -515,23 +607,48 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 			panic("overlay.dentry.DecRef() called without holding a reference")
 		}
 	}
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Unregister(d, "overlay.dentry")
+	}
+}
+
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (d *dentry) LeakMessage() string {
+	return fmt.Sprintf("[overlay.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs))
 }
 
 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
 func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) {
-	// TODO(gvisor.dev/issue/1479): Implement inotify.
+	if d.isDir() {
+		events |= linux.IN_ISDIR
+	}
+
+	// overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
+	// that d was deleted.
+	deleted := d.vfsd.IsDead()
+
+	d.fs.renameMu.RLock()
+	// The ordering below is important, Linux always notifies the parent first.
+	if d.parent != nil {
+		d.parent.watches.Notify(ctx, d.name, events, cookie, et, deleted)
+	}
+	d.watches.Notify(ctx, "", events, cookie, et, deleted)
+	d.fs.renameMu.RUnlock()
 }
 
 // Watches implements vfs.DentryImpl.Watches.
 func (d *dentry) Watches() *vfs.Watches {
-	// TODO(gvisor.dev/issue/1479): Implement inotify.
-	return nil
+	return &d.watches
 }
 
 // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
-//
-// TODO(gvisor.dev/issue/1479): Implement inotify.
-func (d *dentry) OnZeroWatches(context.Context) {}
+func (d *dentry) OnZeroWatches(ctx context.Context) {
+	if atomic.LoadInt64(&d.refs) == 0 {
+		d.fs.renameMu.Lock()
+		d.checkDropLocked(ctx)
+		d.fs.renameMu.Unlock()
+	}
+}
 
 // iterLayers invokes yield on each layer comprising d, from top to bottom. If
 // any call to yield returns false, iterLayer stops iteration.
@@ -564,6 +681,16 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
+func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&d.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&d.gid))
+	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+		return err
+	}
+	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
+}
+
 // statInternalMask is the set of stat fields that is set by
 // dentry.statInternalTo().
 const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
@@ -602,6 +729,8 @@ func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) {
 
 // fileDescription is embedded by overlay implementations of
 // vfs.FileDescriptionImpl.
+//
+// +stateify savable
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -616,6 +745,48 @@ func (fd *fileDescription) dentry() *dentry {
 	return fd.vfsfd.Dentry().Impl().(*dentry)
 }
 
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.filesystem().listXattr(ctx, fd.dentry(), size)
+}
+
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+	return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts)
+}
+
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
+	fs := fd.filesystem()
+	d := fd.dentry()
+
+	fs.renameMu.RLock()
+	err := fs.setXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts)
+	fs.renameMu.RUnlock()
+	if err != nil {
+		return err
+	}
+
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
+	fs := fd.filesystem()
+	d := fd.dentry()
+
+	fs.renameMu.RLock()
+	err := fs.removeXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name)
+	fs.renameMu.RUnlock()
+	if err != nil {
+		return err
+	}
+
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
 // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
 func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
 	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
diff --git a/pkg/sentry/fsimpl/overlay/regular_file.go b/pkg/sentry/fsimpl/overlay/regular_file.go
new file mode 100644
index 000000000..2b89a7a6d
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/regular_file.go
@@ -0,0 +1,456 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+func (d *dentry) isRegularFile() bool {
+	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFREG
+}
+
+func (d *dentry) isSymlink() bool {
+	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
+}
+
+func (d *dentry) readlink(ctx context.Context) (string, error) {
+	layerVD := d.topLayer()
+	return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  layerVD,
+		Start: layerVD,
+	})
+}
+
+// +stateify savable
+type regularFileFD struct {
+	fileDescription
+
+	// If copiedUp is false, cachedFD represents
+	// fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents
+	// fileDescription.dentry().upperVD. cachedFlags is the last known value of
+	// cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are
+	// protected by mu.
+	mu          sync.Mutex `state:"nosave"`
+	copiedUp    bool
+	cachedFD    *vfs.FileDescription
+	cachedFlags uint32
+
+	// If copiedUp is false, lowerWaiters contains all waiter.Entries
+	// registered with cachedFD. lowerWaiters is protected by mu.
+	lowerWaiters map[*waiter.Entry]waiter.EventMask
+}
+
+func (fd *regularFileFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return nil, err
+	}
+	wrappedFD.IncRef()
+	return wrappedFD, nil
+}
+
+func (fd *regularFileFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) {
+	d := fd.dentry()
+	statusFlags := fd.vfsfd.StatusFlags()
+	if !fd.copiedUp && d.isCopiedUp() {
+		// Switch to the copied-up file.
+		upperVD := d.topLayer()
+		upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  upperVD,
+			Start: upperVD,
+		}, &vfs.OpenOptions{
+			Flags: statusFlags,
+		})
+		if err != nil {
+			return nil, err
+		}
+		oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR)
+		if oldOffErr == nil {
+			if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil {
+				upperFD.DecRef(ctx)
+				return nil, err
+			}
+		}
+		if len(fd.lowerWaiters) != 0 {
+			ready := upperFD.Readiness(^waiter.EventMask(0))
+			for e, mask := range fd.lowerWaiters {
+				fd.cachedFD.EventUnregister(e)
+				upperFD.EventRegister(e, mask)
+				if ready&mask != 0 {
+					e.Callback.Callback(e)
+				}
+			}
+		}
+		fd.cachedFD.DecRef(ctx)
+		fd.copiedUp = true
+		fd.cachedFD = upperFD
+		fd.cachedFlags = statusFlags
+		fd.lowerWaiters = nil
+	} else if fd.cachedFlags != statusFlags {
+		if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil {
+			return nil, err
+		}
+		fd.cachedFlags = statusFlags
+	}
+	return fd.cachedFD, nil
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release(ctx context.Context) {
+	fd.cachedFD.DecRef(ctx)
+	fd.cachedFD = nil
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *regularFileFD) OnClose(ctx context.Context) error {
+	// Linux doesn't define ovl_file_operations.flush at all (i.e. its
+	// equivalent to OnClose is a no-op). We pass through to
+	// fd.cachedFD.OnClose() without upgrading if fd.dentry() has been
+	// copied-up, since OnClose is mostly used to define post-close writeback,
+	// and if fd.cachedFD hasn't been updated then it can't have been used to
+	// mutate fd.dentry() anyway.
+	fd.mu.Lock()
+	if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags {
+		if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil {
+			fd.mu.Unlock()
+			return err
+		}
+		fd.cachedFlags = statusFlags
+	}
+	wrappedFD := fd.cachedFD
+	fd.mu.Unlock()
+	return wrappedFD.OnClose(ctx)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *regularFileFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
+		wrappedFD, err := fd.getCurrentFD(ctx)
+		if err != nil {
+			return linux.Statx{}, err
+		}
+		stat, err = wrappedFD.Stat(ctx, vfs.StatOptions{
+			Mask: layerMask,
+			Sync: opts.Sync,
+		})
+		wrappedFD.DecRef(ctx)
+		if err != nil {
+			return linux.Statx{}, err
+		}
+	}
+	fd.dentry().statInternalTo(ctx, &opts, &stat)
+	return stat, nil
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	wrappedFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return err
+	}
+	defer wrappedFD.DecRef(ctx)
+	return wrappedFD.Allocate(ctx, mode, offset, length)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *regularFileFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	d := fd.dentry()
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+		return err
+	}
+	mnt := fd.vfsfd.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := d.copyUpLocked(ctx); err != nil {
+		return err
+	}
+	// Changes to d's attributes are serialized by d.copyMu.
+	d.copyMu.Lock()
+	defer d.copyMu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return err
+	}
+	if err := wrappedFD.SetStat(ctx, opts); err != nil {
+		return err
+	}
+	d.updateAfterSetStatLocked(&opts)
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+	}
+	return nil
+}
+
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *regularFileFD) StatFS(ctx context.Context) (linux.Statfs, error) {
+	return fd.filesystem().statFS(ctx)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *regularFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	ctx := context.Background()
+	wrappedFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		// TODO(b/171089913): Just use fd.cachedFD since Readiness can't return
+		// an error. This is obviously wrong, but at least consistent with
+		// VFS1.
+		log.Warningf("overlay.regularFileFD.Readiness: currentFDLocked failed: %v", err)
+		fd.mu.Lock()
+		wrappedFD = fd.cachedFD
+		wrappedFD.IncRef()
+		fd.mu.Unlock()
+	}
+	defer wrappedFD.DecRef(ctx)
+	return wrappedFD.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *regularFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(context.Background())
+	if err != nil {
+		// TODO(b/171089913): Just use fd.cachedFD since EventRegister can't
+		// return an error. This is obviously wrong, but at least consistent
+		// with VFS1.
+		log.Warningf("overlay.regularFileFD.EventRegister: currentFDLocked failed: %v", err)
+		wrappedFD = fd.cachedFD
+	}
+	wrappedFD.EventRegister(e, mask)
+	if !fd.copiedUp {
+		if fd.lowerWaiters == nil {
+			fd.lowerWaiters = make(map[*waiter.Entry]waiter.EventMask)
+		}
+		fd.lowerWaiters[e] = mask
+	}
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *regularFileFD) EventUnregister(e *waiter.Entry) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	fd.cachedFD.EventUnregister(e)
+	if !fd.copiedUp {
+		delete(fd.lowerWaiters, e)
+	}
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	wrappedFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return 0, err
+	}
+	defer wrappedFD.DecRef(ctx)
+	return wrappedFD.PRead(ctx, dst, offset, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// Hold fd.mu during the read to serialize the file offset.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return wrappedFD.Read(ctx, dst, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	wrappedFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return 0, err
+	}
+	defer wrappedFD.DecRef(ctx)
+	return wrappedFD.PWrite(ctx, src, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// Hold fd.mu during the write to serialize the file offset.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return wrappedFD.Write(ctx, src, opts)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	// Hold fd.mu during the seek to serialize the file offset.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return wrappedFD.Seek(ctx, offset, whence)
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+	fd.mu.Lock()
+	if !fd.dentry().isCopiedUp() {
+		fd.mu.Unlock()
+		return nil
+	}
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		fd.mu.Unlock()
+		return err
+	}
+	wrappedFD.IncRef()
+	defer wrappedFD.DecRef(ctx)
+	fd.mu.Unlock()
+	return wrappedFD.Sync(ctx)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *regularFileFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	wrappedFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return 0, err
+	}
+	defer wrappedFD.DecRef(ctx)
+	return wrappedFD.Ioctl(ctx, uio, args)
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	if err := fd.ensureMappable(ctx, opts); err != nil {
+		return err
+	}
+	return vfs.GenericConfigureMMap(&fd.vfsfd, fd.dentry(), opts)
+}
+
+// ensureMappable ensures that fd.dentry().wrappedMappable is not nil.
+func (fd *regularFileFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error {
+	d := fd.dentry()
+
+	// Fast path if we already have a Mappable for the current top layer.
+	if atomic.LoadUint32(&d.isMappable) != 0 {
+		return nil
+	}
+
+	// Only permit mmap of regular files, since other file types may have
+	// unpredictable behavior when mmapped (e.g. /dev/zero).
+	if atomic.LoadUint32(&d.mode)&linux.S_IFMT != linux.S_IFREG {
+		return syserror.ENODEV
+	}
+
+	// Get a Mappable for the current top layer.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	d.copyMu.RLock()
+	defer d.copyMu.RUnlock()
+	if atomic.LoadUint32(&d.isMappable) != 0 {
+		return nil
+	}
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return err
+	}
+	if err := wrappedFD.ConfigureMMap(ctx, opts); err != nil {
+		return err
+	}
+	if opts.MappingIdentity != nil {
+		opts.MappingIdentity.DecRef(ctx)
+		opts.MappingIdentity = nil
+	}
+	// Use this Mappable for all mappings of this layer (unless we raced with
+	// another call to ensureMappable).
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+	if d.wrappedMappable == nil {
+		d.wrappedMappable = opts.Mappable
+		atomic.StoreUint32(&d.isMappable, 1)
+	}
+	return nil
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	if err := d.wrappedMappable.AddMapping(ctx, ms, ar, offset, writable); err != nil {
+		return err
+	}
+	if !d.isCopiedUp() {
+		d.lowerMappings.AddMapping(ms, ar, offset, writable)
+	}
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	d.wrappedMappable.RemoveMapping(ctx, ms, ar, offset, writable)
+	if !d.isCopiedUp() {
+		d.lowerMappings.RemoveMapping(ms, ar, offset, writable)
+	}
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	if err := d.wrappedMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil {
+		return err
+	}
+	if !d.isCopiedUp() {
+		d.lowerMappings.AddMapping(ms, dstAR, offset, writable)
+	}
+	return nil
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	d.dataMu.RLock()
+	defer d.dataMu.RUnlock()
+	return d.wrappedMappable.Translate(ctx, required, optional, at)
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	return d.wrappedMappable.InvalidateUnsavable(ctx)
+}
diff --git a/pkg/sentry/fsimpl/overlay/save_restore.go b/pkg/sentry/fsimpl/overlay/save_restore.go
new file mode 100644
index 000000000..054e17b17
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/save_restore.go
@@ -0,0 +1,27 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/refsvfs2"
+)
+
+func (d *dentry) afterLoad() {
+	if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&d.refs) != -1 {
+		refsvfs2.Register(d, "overlay.dentry")
+	}
+}
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index 2ca793db9..e44b79b68 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -31,6 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// +stateify savable
 type filesystemType struct{}
 
 // Name implements vfs.FilesystemType.Name.
@@ -38,11 +39,15 @@ func (filesystemType) Name() string {
 	return "pipefs"
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (filesystemType) Release(ctx context.Context) {}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	panic("pipefs.filesystemType.GetFilesystem should never be called")
 }
 
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 
@@ -76,6 +81,8 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 }
 
 // inode implements kernfs.Inode.
+//
+// +stateify savable
 type inode struct {
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
@@ -143,12 +150,14 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.
 	return syserror.EPERM
 }
 
-// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement
-// statfs, from which we should indicate PIPEFS_MAGIC.
-
 // Open implements kernfs.Inode.Open.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks)
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	return i.pipe.Open(ctx, rp.Mount(), d.VFSDentry(), opts.Flags, &i.locks)
+}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.PIPEFS_MAGIC), nil
 }
 
 // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
@@ -159,7 +168,7 @@ func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vf
 	fs := mnt.Filesystem().Impl().(*filesystem)
 	inode := newInode(ctx, fs)
 	var d kernfs.Dentry
-	d.Init(inode)
+	d.Init(&fs.Filesystem, inode)
 	defer d.DecRef(ctx)
 	return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags)
 }
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 14ecfd300..5196a2a80 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -1,18 +1,79 @@
 load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "fd_dir_inode_refs",
+    out = "fd_dir_inode_refs.go",
+    package = "proc",
+    prefix = "fdDirInode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "fdDirInode",
+    },
+)
+
+go_template_instance(
+    name = "fd_info_dir_inode_refs",
+    out = "fd_info_dir_inode_refs.go",
+    package = "proc",
+    prefix = "fdInfoDirInode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "fdInfoDirInode",
+    },
+)
+
+go_template_instance(
+    name = "subtasks_inode_refs",
+    out = "subtasks_inode_refs.go",
+    package = "proc",
+    prefix = "subtasksInode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "subtasksInode",
+    },
+)
+
+go_template_instance(
+    name = "task_inode_refs",
+    out = "task_inode_refs.go",
+    package = "proc",
+    prefix = "taskInode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "taskInode",
+    },
+)
+
+go_template_instance(
+    name = "tasks_inode_refs",
+    out = "tasks_inode_refs.go",
+    package = "proc",
+    prefix = "tasksInode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "tasksInode",
+    },
+)
+
 go_library(
     name = "proc",
     srcs = [
+        "fd_dir_inode_refs.go",
+        "fd_info_dir_inode_refs.go",
         "filesystem.go",
         "subtasks.go",
+        "subtasks_inode_refs.go",
         "task.go",
         "task_fds.go",
         "task_files.go",
+        "task_inode_refs.go",
         "task_net.go",
         "tasks.go",
         "tasks_files.go",
+        "tasks_inode_refs.go",
         "tasks_sys.go",
     ],
     visibility = ["//pkg/sentry:internal"],
@@ -21,6 +82,7 @@ go_library(
         "//pkg/context",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsbridge",
@@ -39,6 +101,7 @@ go_library(
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/network/ipv4",
         "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 2463d51cd..99abcab66 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -17,6 +17,7 @@ package proc
 
 import (
 	"fmt"
+	"strconv"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -24,23 +25,29 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// Name is the default filesystem name.
-const Name = "proc"
+const (
+	// Name is the default filesystem name.
+	Name                     = "proc"
+	defaultMaxCachedDentries = uint64(1000)
+)
 
 // FilesystemType is the factory class for procfs.
 //
 // +stateify savable
 type FilesystemType struct{}
 
-var _ vfs.FilesystemType = (*FilesystemType)(nil)
-
 // Name implements vfs.FilesystemType.Name.
 func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 
@@ -61,9 +68,22 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF
 	if err != nil {
 		return nil, nil, err
 	}
+
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	maxCachedDentries := defaultMaxCachedDentries
+	if str, ok := mopts["dentry_cache_limit"]; ok {
+		delete(mopts, "dentry_cache_limit")
+		maxCachedDentries, err = strconv.ParseUint(str, 10, 64)
+		if err != nil {
+			ctx.Warningf("proc.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
+			return nil, nil, syserror.EINVAL
+		}
+	}
+
 	procfs := &filesystem{
 		devMinor: devMinor,
 	}
+	procfs.MaxCachedDentries = maxCachedDentries
 	procfs.VFSFilesystem().Init(vfsObj, &ft, procfs)
 
 	var cgroups map[string]string
@@ -72,7 +92,9 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF
 		cgroups = data.Cgroups
 	}
 
-	_, dentry := procfs.newTasksInode(k, pidns, cgroups)
+	inode := procfs.newTasksInode(ctx, k, pidns, cgroups)
+	var dentry kernfs.Dentry
+	dentry.Init(&procfs.Filesystem, inode)
 	return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
 }
 
@@ -84,21 +106,21 @@ func (fs *filesystem) Release(ctx context.Context) {
 
 // dynamicInode is an overfitted interface for common Inodes with
 // dynamicByteSource types used in procfs.
+//
+// +stateify savable
 type dynamicInode interface {
 	kernfs.Inode
 	vfs.DynamicBytesSource
 
-	Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
+	Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
 }
 
-func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
-	inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
-
-	d := &kernfs.Dentry{}
-	d.Init(inode)
-	return d
+func (fs *filesystem) newInode(ctx context.Context, creds *auth.Credentials, perm linux.FileMode, inode dynamicInode) dynamicInode {
+	inode.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), inode, perm)
+	return inode
 }
 
+// +stateify savable
 type staticFile struct {
 	kernfs.DynamicBytesFile
 	vfs.StaticData
@@ -110,8 +132,24 @@ func newStaticFile(data string) *staticFile {
 	return &staticFile{StaticData: vfs.StaticData{Data: data}}
 }
 
+func (fs *filesystem) newStaticDir(ctx context.Context, creds *auth.Credentials, children map[string]kernfs.Inode) kernfs.Inode {
+	return kernfs.NewStaticDir(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, children, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
+}
+
 // InternalData contains internal data passed in to the procfs mount via
 // vfs.GetFilesystemOptions.InternalData.
+//
+// +stateify savable
 type InternalData struct {
 	Cgroups map[string]string
 }
+
+// +stateify savable
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.PROC_SUPER_MAGIC), nil
+}
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 79c2725f3..cb3c5e0fd 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -31,11 +31,14 @@ import (
 //
 // +stateify savable
 type subtasksInode struct {
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
+	implStatFS
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
-	kernfs.AlwaysValid
+	subtasksInodeRefs
 
 	locks vfs.FileLocks
 
@@ -47,7 +50,7 @@ type subtasksInode struct {
 
 var _ kernfs.Inode = (*subtasksInode)(nil)
 
-func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *kernfs.Dentry {
+func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) kernfs.Inode {
 	subInode := &subtasksInode{
 		fs:                fs,
 		task:              task,
@@ -55,18 +58,16 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace,
 		cgroupControllers: cgroupControllers,
 	}
 	// Note: credentials are overridden by taskOwnedInode.
-	subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	subInode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
 	subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	subInode.EnableLeakCheck()
 
 	inode := &taskOwnedInode{Inode: subInode, owner: task}
-	dentry := &kernfs.Dentry{}
-	dentry.Init(inode)
-
-	return dentry
+	return inode
 }
 
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
 	tid, err := strconv.ParseUint(name, 10, 32)
 	if err != nil {
 		return nil, syserror.ENOENT
@@ -79,13 +80,11 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e
 	if subTask.ThreadGroup() != i.task.ThreadGroup() {
 		return nil, syserror.ENOENT
 	}
-
-	subTaskDentry := i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers)
-	return subTaskDentry.VFSDentry(), nil
+	return i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers)
 }
 
-// IterDirents implements kernfs.inodeDynamicLookup.
-func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *subtasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
 	tasks := i.task.ThreadGroup().MemberIDs(i.pidns)
 	if len(tasks) == 0 {
 		return offset, syserror.ENOENT
@@ -115,6 +114,7 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
 	return offset, nil
 }
 
+// +stateify savable
 type subtasksFD struct {
 	kernfs.GenericDirectoryFD
 
@@ -152,19 +152,21 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro
 	return fd.GenericDirectoryFD.SetStat(ctx, opts)
 }
 
-// Open implements kernfs.Inode.
-func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Open implements kernfs.Inode.Open.
+func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &subtasksFD{task: i.task}
-	if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil {
+	if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	}); err != nil {
 		return nil, err
 	}
-	if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
 func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
 	stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
 	if err != nil {
@@ -176,7 +178,12 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs
 	return stat, nil
 }
 
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
 func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *subtasksInode) DecRef(ctx context.Context) {
+	i.subtasksInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index a5c7aa470..57cf8ce26 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -32,11 +32,13 @@ import (
 //
 // +stateify savable
 type taskInode struct {
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
-	kernfs.InodeNoDynamicLookup
+	implStatFS
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
+	taskInodeRefs
 
 	locks vfs.FileLocks
 
@@ -45,80 +47,92 @@ type taskInode struct {
 
 var _ kernfs.Inode = (*taskInode)(nil)
 
-func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry {
-	// TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited.
-	contents := map[string]*kernfs.Dentry{
-		"auxv":      fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}),
-		"cmdline":   fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
+func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) (kernfs.Inode, error) {
+	if task.ExitState() == kernel.TaskExitDead {
+		return nil, syserror.ESRCH
+	}
+
+	contents := map[string]kernfs.Inode{
+		"auxv":      fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &auxvData{task: task}),
+		"cmdline":   fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
 		"comm":      fs.newComm(task, fs.NextIno(), 0444),
-		"environ":   fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
+		"cwd":       fs.newCwdSymlink(task, fs.NextIno()),
+		"environ":   fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
 		"exe":       fs.newExeSymlink(task, fs.NextIno()),
 		"fd":        fs.newFDDirInode(task),
 		"fdinfo":    fs.newFDInfoDirInode(task),
-		"gid_map":   fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}),
-		"io":        fs.newTaskOwnedFile(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)),
-		"maps":      fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mapsData{task: task}),
-		"mountinfo": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountInfoData{task: task}),
-		"mounts":    fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountsData{task: task}),
+		"gid_map":   fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}),
+		"io":        fs.newTaskOwnedInode(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)),
+		"maps":      fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mapsData{task: task}),
+		"mountinfo": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountInfoData{task: task}),
+		"mounts":    fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountsData{task: task}),
 		"net":       fs.newTaskNetDir(task),
-		"ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]*kernfs.Dentry{
+		"ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]kernfs.Inode{
 			"net":  fs.newNamespaceSymlink(task, fs.NextIno(), "net"),
 			"pid":  fs.newNamespaceSymlink(task, fs.NextIno(), "pid"),
 			"user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"),
 		}),
-		"oom_score":     fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newStaticFile("0\n")),
-		"oom_score_adj": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}),
-		"smaps":         fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &smapsData{task: task}),
-		"stat":          fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
-		"statm":         fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statmData{task: task}),
-		"status":        fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
-		"uid_map":       fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
+		"oom_score":     fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newStaticFile("0\n")),
+		"oom_score_adj": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}),
+		"smaps":         fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &smapsData{task: task}),
+		"stat":          fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
+		"statm":         fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statmData{task: task}),
+		"status":        fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
+		"uid_map":       fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
 	}
 	if isThreadGroup {
 		contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers)
 	}
 	if len(cgroupControllers) > 0 {
-		contents["cgroup"] = fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers))
+		contents["cgroup"] = fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers))
 	}
 
 	taskInode := &taskInode{task: task}
 	// Note: credentials are overridden by taskOwnedInode.
-	taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	taskInode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	taskInode.EnableLeakCheck()
 
 	inode := &taskOwnedInode{Inode: taskInode, owner: task}
-	dentry := &kernfs.Dentry{}
-	dentry.Init(inode)
 
 	taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	links := taskInode.OrderedChildren.Populate(dentry, contents)
+	links := taskInode.OrderedChildren.Populate(contents)
 	taskInode.IncLinks(links)
 
-	return dentry
+	return inode, nil
 }
 
-// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long
+// Valid implements kernfs.Inode.Valid. This inode remains valid as long
 // as the task is still running. When it's dead, another tasks with the same
 // PID could replace it.
 func (i *taskInode) Valid(ctx context.Context) bool {
 	return i.task.ExitState() != kernel.TaskExitDead
 }
 
-// Open implements kernfs.Inode.
-func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+// Open implements kernfs.Inode.Open.
+func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
 func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
+// DecRef implements kernfs.Inode.DecRef.
+func (i *taskInode) DecRef(ctx context.Context) {
+	i.taskInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
 // taskOwnedInode implements kernfs.Inode and overrides inode owner with task
 // effective user and group.
+//
+// +stateify savable
 type taskOwnedInode struct {
 	kernfs.Inode
 
@@ -128,34 +142,26 @@ type taskOwnedInode struct {
 
 var _ kernfs.Inode = (*taskOwnedInode)(nil)
 
-func (fs *filesystem) newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+func (fs *filesystem) newTaskOwnedInode(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) kernfs.Inode {
 	// Note: credentials are overridden by taskOwnedInode.
-	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
+	inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
 
-	taskInode := &taskOwnedInode{Inode: inode, owner: task}
-	d := &kernfs.Dentry{}
-	d.Init(taskInode)
-	return d
+	return &taskOwnedInode{Inode: inode, owner: task}
 }
 
-func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry {
-	dir := &kernfs.StaticDirectory{}
-
+func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]kernfs.Inode) kernfs.Inode {
 	// Note: credentials are overridden by taskOwnedInode.
-	dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm)
-
-	inode := &taskOwnedInode{Inode: dir, owner: task}
-	d := &kernfs.Dentry{}
-	d.Init(inode)
+	fdOpts := kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndZero}
+	dir := kernfs.NewStaticDir(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, children, fdOpts)
 
-	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	links := dir.OrderedChildren.Populate(d, children)
-	dir.IncLinks(links)
+	return &taskOwnedInode{Inode: dir, owner: task}
+}
 
-	return d
+func (i *taskOwnedInode) Valid(ctx context.Context) bool {
+	return i.owner.ExitState() != kernel.TaskExitDead && i.Inode.Valid(ctx)
 }
 
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
 func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
 	stat, err := i.Inode.Stat(ctx, fs, opts)
 	if err != nil {
@@ -173,7 +179,7 @@ func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.
 	return stat, nil
 }
 
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
 func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
 	mode := i.Mode()
 	uid, gid := i.getOwner(mode)
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index f0d3f7f5e..d268b44be 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -22,7 +22,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -52,6 +51,7 @@ func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool {
 	return true
 }
 
+// +stateify savable
 type fdDir struct {
 	locks vfs.FileLocks
 
@@ -63,8 +63,8 @@ type fdDir struct {
 	produceSymlink bool
 }
 
-// IterDirents implements kernfs.inodeDynamicLookup.
-func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *fdDir) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
 	var fds []int32
 	i.task.WithMuLocked(func(t *kernel.Task) {
 		if fdTable := t.FDTable(); fdTable != nil {
@@ -87,31 +87,39 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off
 			Name:    strconv.FormatUint(uint64(fd), 10),
 			Type:    typ,
 			Ino:     i.fs.NextIno(),
-			NextOff: offset + 1,
+			NextOff: int64(fd) + 3,
 		}
 		if err := cb.Handle(dirent); err != nil {
-			return offset, err
+			// Getdents should iterate correctly despite mutation
+			// of fds, so we return the next fd to serialize plus
+			// 2 (which accounts for the "." and ".." tracked by
+			// kernfs) as the offset.
+			return int64(fd) + 2, err
 		}
-		offset++
 	}
-	return offset, nil
+	// We serialized them all.  Next offset should be higher than last
+	// serialized fd.
+	return int64(fds[len(fds)-1]) + 3, nil
 }
 
 // fdDirInode represents the inode for /proc/[pid]/fd directory.
 //
 // +stateify savable
 type fdDirInode struct {
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
+	fdDir
+	fdDirInodeRefs
+	implStatFS
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
-	kernfs.AlwaysValid
-	fdDir
 }
 
 var _ kernfs.Inode = (*fdDirInode)(nil)
 
-func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newFDDirInode(task *kernel.Task) kernfs.Inode {
 	inode := &fdDirInode{
 		fdDir: fdDir{
 			fs:             fs,
@@ -119,17 +127,19 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
 			produceSymlink: true,
 		},
 	}
-	inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
-
-	dentry := &kernfs.Dentry{}
-	dentry.Init(inode)
+	inode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	inode.EnableLeakCheck()
 	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	return inode
+}
 
-	return dentry
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *fdDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	return i.fdDir.IterDirents(ctx, mnt, cb, offset, relOffset)
 }
 
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
 	fdInt, err := strconv.ParseInt(name, 10, 32)
 	if err != nil {
 		return nil, syserror.ENOENT
@@ -138,20 +148,21 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 	if !taskFDExists(ctx, i.task, fd) {
 		return nil, syserror.ENOENT
 	}
-	taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno())
-	return taskDentry.VFSDentry(), nil
+	return i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()), nil
 }
 
-// Open implements kernfs.Inode.
-func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+// Open implements kernfs.Inode.Open.
+func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
 //
 // This is to match Linux, which uses a special permission handler to guarantee
 // that a process can still access /proc/self/fd after it has executed
@@ -173,10 +184,16 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia
 	return err
 }
 
+// DecRef implements kernfs.Inode.DecRef.
+func (i *fdDirInode) DecRef(ctx context.Context) {
+	i.fdDirInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
 // fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file.
 //
 // +stateify savable
 type fdSymlink struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeSymlink
@@ -187,19 +204,16 @@ type fdSymlink struct {
 
 var _ kernfs.Inode = (*fdSymlink)(nil)
 
-func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry {
+func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) kernfs.Inode {
 	inode := &fdSymlink{
 		task: task,
 		fd:   fd,
 	}
-	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
-	d := &kernfs.Dentry{}
-	d.Init(inode)
-	return d
+	inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+	return inode
 }
 
-func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
 	file, _ := getTaskFD(s.task, s.fd)
 	if file == nil {
 		return "", syserror.ENOENT
@@ -221,38 +235,43 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen
 	return vd, "", nil
 }
 
+// Valid implements kernfs.Inode.Valid.
+func (s *fdSymlink) Valid(ctx context.Context) bool {
+	return taskFDExists(ctx, s.task, s.fd)
+}
+
 // fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory.
 //
 // +stateify savable
 type fdInfoDirInode struct {
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
+	fdDir
+	fdInfoDirInodeRefs
+	implStatFS
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
-	kernfs.AlwaysValid
-	fdDir
 }
 
 var _ kernfs.Inode = (*fdInfoDirInode)(nil)
 
-func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) kernfs.Inode {
 	inode := &fdInfoDirInode{
 		fdDir: fdDir{
 			fs:   fs,
 			task: task,
 		},
 	}
-	inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
-
-	dentry := &kernfs.Dentry{}
-	dentry.Init(inode)
+	inode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	inode.EnableLeakCheck()
 	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-
-	return dentry
+	return inode
 }
 
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
 	fdInt, err := strconv.ParseInt(name, 10, 32)
 	if err != nil {
 		return nil, syserror.ENOENT
@@ -265,25 +284,35 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
 		task: i.task,
 		fd:   fd,
 	}
-	dentry := i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data)
-	return dentry.VFSDentry(), nil
+	return i.fs.newTaskOwnedInode(i.task, i.fs.NextIno(), 0444, data), nil
 }
 
-// Open implements kernfs.Inode.
-func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+// IterDirents implements Inode.IterDirents.
+func (i *fdInfoDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+	return i.fdDir.IterDirents(ctx, mnt, cb, offset, relOffset)
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
+// DecRef implements kernfs.Inode.DecRef.
+func (i *fdInfoDirInode) DecRef(ctx context.Context) {
+	i.fdInfoDirInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
 // fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd].
 //
 // +stateify savable
 type fdInfoData struct {
 	kernfs.DynamicBytesFile
-	refs.AtomicRefCount
 
 	task *kernel.Task
 	fd   int32
@@ -305,3 +334,8 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "flags:\t0%o\n", flags)
 	return nil
 }
+
+// Valid implements kernfs.Inode.Valid.
+func (d *fdInfoData) Valid(ctx context.Context) bool {
+	return taskFDExists(ctx, d.task, d.fd)
+}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 830b78949..d3f4e259b 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -247,13 +247,10 @@ type commInode struct {
 	task *kernel.Task
 }
 
-func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry {
+func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode {
 	inode := &commInode{task: task}
-	inode.DynamicBytesFile.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm)
-
-	d := &kernfs.Dentry{}
-	d.Init(inode)
-	return d
+	inode.DynamicBytesFile.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm)
+	return inode
 }
 
 func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
@@ -543,7 +540,7 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	var vss, rss, data uint64
 	s.task.WithMuLocked(func(t *kernel.Task) {
 		if fdTable := t.FDTable(); fdTable != nil {
-			fds = fdTable.Size()
+			fds = fdTable.CurrentMaxFDs()
 		}
 		if mm := t.MemoryManager(); mm != nil {
 			vss = mm.VirtualMemorySize()
@@ -648,6 +645,7 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset
 //
 // +stateify savable
 type exeSymlink struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeSymlink
@@ -657,29 +655,30 @@ type exeSymlink struct {
 
 var _ kernfs.Inode = (*exeSymlink)(nil)
 
-func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry {
+func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) kernfs.Inode {
 	inode := &exeSymlink{task: task}
-	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
-	d := &kernfs.Dentry{}
-	d.Init(inode)
-	return d
+	inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+	return inode
 }
 
-// Readlink implements kernfs.Inode.
-func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
-	if !kernel.ContextCanTrace(ctx, s.task, false) {
-		return "", syserror.EACCES
-	}
-
-	// Pull out the executable for /proc/[pid]/exe.
-	exec, err := s.executable()
+// Readlink implements kernfs.Inode.Readlink.
+func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+	exec, _, err := s.Getlink(ctx, nil)
 	if err != nil {
 		return "", err
 	}
 	defer exec.DecRef(ctx)
 
-	return exec.PathnameWithDeleted(ctx), nil
+	root := vfs.RootFromContext(ctx)
+	if !root.Ok() {
+		// It could have raced with process deletion.
+		return "", syserror.ESRCH
+	}
+	defer root.DecRef(ctx)
+
+	vfsObj := exec.Mount().Filesystem().VirtualFilesystem()
+	name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec)
+	return name, nil
 }
 
 // Getlink implements kernfs.Inode.Getlink.
@@ -687,23 +686,12 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent
 	if !kernel.ContextCanTrace(ctx, s.task, false) {
 		return vfs.VirtualDentry{}, "", syserror.EACCES
 	}
-
-	exec, err := s.executable()
-	if err != nil {
-		return vfs.VirtualDentry{}, "", err
-	}
-	defer exec.DecRef(ctx)
-
-	vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
-	vd.IncRef()
-	return vd, "", nil
-}
-
-func (s *exeSymlink) executable() (file fsbridge.File, err error) {
 	if err := checkTaskState(s.task); err != nil {
-		return nil, err
+		return vfs.VirtualDentry{}, "", err
 	}
 
+	var err error
+	var exec fsbridge.File
 	s.task.WithMuLocked(func(t *kernel.Task) {
 		mm := t.MemoryManager()
 		if mm == nil {
@@ -714,12 +702,75 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) {
 		// The MemoryManager may be destroyed, in which case
 		// MemoryManager.destroy will simply set the executable to nil
 		// (with locks held).
-		file = mm.Executable()
-		if file == nil {
+		exec = mm.Executable()
+		if exec == nil {
 			err = syserror.ESRCH
 		}
 	})
-	return
+	if err != nil {
+		return vfs.VirtualDentry{}, "", err
+	}
+	defer exec.DecRef(ctx)
+
+	vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
+	vd.IncRef()
+	return vd, "", nil
+}
+
+// cwdSymlink is an symlink for the /proc/[pid]/cwd file.
+//
+// +stateify savable
+type cwdSymlink struct {
+	implStatFS
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	task *kernel.Task
+}
+
+var _ kernfs.Inode = (*cwdSymlink)(nil)
+
+func (fs *filesystem) newCwdSymlink(task *kernel.Task, ino uint64) kernfs.Inode {
+	inode := &cwdSymlink{task: task}
+	inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+	return inode
+}
+
+// Readlink implements kernfs.Inode.Readlink.
+func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+	cwd, _, err := s.Getlink(ctx, nil)
+	if err != nil {
+		return "", err
+	}
+	defer cwd.DecRef(ctx)
+
+	root := vfs.RootFromContext(ctx)
+	if !root.Ok() {
+		// It could have raced with process deletion.
+		return "", syserror.ESRCH
+	}
+	defer root.DecRef(ctx)
+
+	vfsObj := cwd.Mount().Filesystem().VirtualFilesystem()
+	name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd)
+	return name, nil
+}
+
+// Getlink implements kernfs.Inode.Getlink.
+func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	if !kernel.ContextCanTrace(ctx, s.task, false) {
+		return vfs.VirtualDentry{}, "", syserror.EACCES
+	}
+	if err := checkTaskState(s.task); err != nil {
+		return vfs.VirtualDentry{}, "", err
+	}
+	cwd := s.task.FSContext().WorkingDirectoryVFS2()
+	if !cwd.Ok() {
+		// It could have raced with process deletion.
+		return vfs.VirtualDentry{}, "", syserror.ESRCH
+	}
+	return cwd, "", nil
 }
 
 // mountInfoData is used to implement /proc/[pid]/mountinfo.
@@ -784,13 +835,14 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
+// +stateify savable
 type namespaceSymlink struct {
 	kernfs.StaticSymlink
 
 	task *kernel.Task
 }
 
-func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
+func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) kernfs.Inode {
 	// Namespace symlinks should contain the namespace name and the inode number
 	// for the namespace instance, so for example user:[123456]. We currently fake
 	// the inode number by sticking the symlink inode in its place.
@@ -798,40 +850,44 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri
 
 	inode := &namespaceSymlink{task: task}
 	// Note: credentials are overridden by taskOwnedInode.
-	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
+	inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
 
 	taskInode := &taskOwnedInode{Inode: inode, owner: task}
-	d := &kernfs.Dentry{}
-	d.Init(taskInode)
-	return d
+	return taskInode
 }
 
-// Readlink implements Inode.
-func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) {
+// Readlink implements kernfs.Inode.Readlink.
+func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
 	if err := checkTaskState(s.task); err != nil {
 		return "", err
 	}
-	return s.StaticSymlink.Readlink(ctx)
+	return s.StaticSymlink.Readlink(ctx, mnt)
 }
 
-// Getlink implements Inode.Getlink.
+// Getlink implements kernfs.Inode.Getlink.
 func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
 	if err := checkTaskState(s.task); err != nil {
 		return vfs.VirtualDentry{}, "", err
 	}
 
 	// Create a synthetic inode to represent the namespace.
+	fs := mnt.Filesystem().Impl().(*filesystem)
+	nsInode := &namespaceInode{}
+	nsInode.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0444)
 	dentry := &kernfs.Dentry{}
-	dentry.Init(&namespaceInode{})
+	dentry.Init(&fs.Filesystem, nsInode)
 	vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry())
-	vd.IncRef()
-	dentry.DecRef(ctx)
+	// Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1.
+	mnt.IncRef()
 	return vd, "", nil
 }
 
 // namespaceInode is a synthetic inode created to represent a namespace in
 // /proc/[pid]/ns/*.
+//
+// +stateify savable
 type namespaceInode struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeNotDirectory
@@ -843,19 +899,19 @@ type namespaceInode struct {
 var _ kernfs.Inode = (*namespaceInode)(nil)
 
 // Init initializes a namespace inode.
-func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+func (i *namespaceInode) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
 	if perm&^linux.PermissionsMask != 0 {
 		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
 	}
-	i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
+	i.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
 }
 
-// Open implements Inode.Open.
-func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Open implements kernfs.Inode.Open.
+func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &namespaceFD{inode: i}
 	i.IncRef()
 	fd.LockFD.Init(&i.locks)
-	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
@@ -863,6 +919,8 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *
 
 // namespace FD is a synthetic file that represents a namespace in
 // /proc/[pid]/ns/*.
+//
+// +stateify savable
 type namespaceFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.LockFD
@@ -873,20 +931,20 @@ type namespaceFD struct {
 
 var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
 
-// Stat implements FileDescriptionImpl.
+// Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
 	return fd.inode.Stat(ctx, vfs, opts)
 }
 
-// SetStat implements FileDescriptionImpl.
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
 	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
 	creds := auth.CredentialsFromContext(ctx)
 	return fd.inode.SetStat(ctx, vfs, creds, opts)
 }
 
-// Release implements FileDescriptionImpl.
+// Release implements vfs.FileDescriptionImpl.Release.
 func (fd *namespaceFD) Release(ctx context.Context) {
 	fd.inode.DecRef(ctx)
 }
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index a4c884bf9..5a9ee111f 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -37,12 +37,12 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newTaskNetDir(task *kernel.Task) kernfs.Inode {
 	k := task.Kernel()
 	pidns := task.PIDNamespace()
 	root := auth.NewRootCredentials(pidns.UserNamespace())
 
-	var contents map[string]*kernfs.Dentry
+	var contents map[string]kernfs.Inode
 	if stack := task.NetworkNamespace().Stack(); stack != nil {
 		const (
 			arp       = "IP address       HW type     Flags       HW address            Mask     Device\n"
@@ -56,34 +56,34 @@ func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry {
 
 		// TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task
 		// network namespace.
-		contents = map[string]*kernfs.Dentry{
-			"dev":  fs.newDentry(root, fs.NextIno(), 0444, &netDevData{stack: stack}),
-			"snmp": fs.newDentry(root, fs.NextIno(), 0444, &netSnmpData{stack: stack}),
+		contents = map[string]kernfs.Inode{
+			"dev":  fs.newInode(task, root, 0444, &netDevData{stack: stack}),
+			"snmp": fs.newInode(task, root, 0444, &netSnmpData{stack: stack}),
 
 			// The following files are simple stubs until they are implemented in
 			// netstack, if the file contains a header the stub is just the header
 			// otherwise it is an empty file.
-			"arp":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(arp)),
-			"netlink":   fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(netlink)),
-			"netstat":   fs.newDentry(root, fs.NextIno(), 0444, &netStatData{}),
-			"packet":    fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(packet)),
-			"protocols": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(protocols)),
+			"arp":       fs.newInode(task, root, 0444, newStaticFile(arp)),
+			"netlink":   fs.newInode(task, root, 0444, newStaticFile(netlink)),
+			"netstat":   fs.newInode(task, root, 0444, &netStatData{}),
+			"packet":    fs.newInode(task, root, 0444, newStaticFile(packet)),
+			"protocols": fs.newInode(task, root, 0444, newStaticFile(protocols)),
 
 			// Linux sets psched values to: nsec per usec, psched tick in ns, 1000000,
 			// high res timer ticks per sec (ClockGetres returns 1ns resolution).
-			"psched": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(psched)),
-			"ptype":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(ptype)),
-			"route":  fs.newDentry(root, fs.NextIno(), 0444, &netRouteData{stack: stack}),
-			"tcp":    fs.newDentry(root, fs.NextIno(), 0444, &netTCPData{kernel: k}),
-			"udp":    fs.newDentry(root, fs.NextIno(), 0444, &netUDPData{kernel: k}),
-			"unix":   fs.newDentry(root, fs.NextIno(), 0444, &netUnixData{kernel: k}),
+			"psched": fs.newInode(task, root, 0444, newStaticFile(psched)),
+			"ptype":  fs.newInode(task, root, 0444, newStaticFile(ptype)),
+			"route":  fs.newInode(task, root, 0444, &netRouteData{stack: stack}),
+			"tcp":    fs.newInode(task, root, 0444, &netTCPData{kernel: k}),
+			"udp":    fs.newInode(task, root, 0444, &netUDPData{kernel: k}),
+			"unix":   fs.newInode(task, root, 0444, &netUnixData{kernel: k}),
 		}
 
 		if stack.SupportsIPv6() {
-			contents["if_inet6"] = fs.newDentry(root, fs.NextIno(), 0444, &ifinet6{stack: stack})
-			contents["ipv6_route"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(""))
-			contents["tcp6"] = fs.newDentry(root, fs.NextIno(), 0444, &netTCP6Data{kernel: k})
-			contents["udp6"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(upd6))
+			contents["if_inet6"] = fs.newInode(task, root, 0444, &ifinet6{stack: stack})
+			contents["ipv6_route"] = fs.newInode(task, root, 0444, newStaticFile(""))
+			contents["tcp6"] = fs.newInode(task, root, 0444, &netTCP6Data{kernel: k})
+			contents["udp6"] = fs.newInode(task, root, 0444, newStaticFile(upd6))
 		}
 	}
 
@@ -262,7 +262,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		// For now, we always redact this pointer.
 		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d",
 			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
-			s.Refs()-1,                    // RefCount, don't count our own ref.
+			s.ReadRefs()-1,                // RefCount, don't count our own ref.
 			0,                             // Protocol, always 0 for UDS.
 			sockFlags,                     // Flags.
 			sops.Endpoint().Type(),        // Type.
@@ -430,7 +430,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
 
 		// Field: refcount. Don't count the ref we obtain while deferencing
 		// the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", s.Refs()-1)
+		fmt.Fprintf(buf, "%d ", s.ReadRefs()-1)
 
 		// Field: Socket struct address. Redacted due to the same reason as
 		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -589,7 +589,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 
 		// Field: ref; reference count on the socket inode. Don't count the ref
 		// we obtain while deferencing the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", s.Refs()-1)
+		fmt.Fprintf(buf, "%d ", s.ReadRefs()-1)
 
 		// Field: Socket struct address. Redacted due to the same reason as
 		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -616,6 +616,7 @@ type netSnmpData struct {
 
 var _ dynamicInode = (*netSnmpData)(nil)
 
+// +stateify savable
 type snmpLine struct {
 	prefix string
 	header string
@@ -660,7 +661,7 @@ func sprintSlice(s []uint64) string {
 	return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice.
 }
 
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
 func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	types := []interface{}{
 		&inet.StatSNMPIP{},
@@ -709,7 +710,7 @@ type netRouteData struct {
 
 var _ dynamicInode = (*netRouteData)(nil)
 
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
 // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
 func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT")
@@ -773,7 +774,7 @@ type netStatData struct {
 
 var _ dynamicInode = (*netStatData)(nil)
 
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
 // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
 func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " +
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 6d2b90a8b..b81ea14bf 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -37,11 +37,14 @@ const (
 //
 // +stateify savable
 type tasksInode struct {
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
+	implStatFS
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid.
 	kernfs.OrderedChildren
-	kernfs.AlwaysValid
+	tasksInodeRefs
 
 	locks vfs.FileLocks
 
@@ -50,8 +53,6 @@ type tasksInode struct {
 
 	// '/proc/self' and '/proc/thread-self' have custom directory offsets in
 	// Linux. So handle them outside of OrderedChildren.
-	selfSymlink       *vfs.Dentry
-	threadSelfSymlink *vfs.Dentry
 
 	// cgroupControllers is a map of controller name to directory in the
 	// cgroup hierarchy. These controllers are immutable and will be listed
@@ -61,51 +62,53 @@ type tasksInode struct {
 
 var _ kernfs.Inode = (*tasksInode)(nil)
 
-func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) {
+func (fs *filesystem) newTasksInode(ctx context.Context, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *tasksInode {
 	root := auth.NewRootCredentials(pidns.UserNamespace())
-	contents := map[string]*kernfs.Dentry{
-		"cpuinfo":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))),
-		"filesystems": fs.newDentry(root, fs.NextIno(), 0444, &filesystemsData{}),
-		"loadavg":     fs.newDentry(root, fs.NextIno(), 0444, &loadavgData{}),
-		"sys":         fs.newSysDir(root, k),
-		"meminfo":     fs.newDentry(root, fs.NextIno(), 0444, &meminfoData{}),
-		"mounts":      kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"),
-		"net":         kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"),
-		"stat":        fs.newDentry(root, fs.NextIno(), 0444, &statData{}),
-		"uptime":      fs.newDentry(root, fs.NextIno(), 0444, &uptimeData{}),
-		"version":     fs.newDentry(root, fs.NextIno(), 0444, &versionData{}),
+	contents := map[string]kernfs.Inode{
+		"cpuinfo":     fs.newInode(ctx, root, 0444, newStaticFileSetStat(cpuInfoData(k))),
+		"filesystems": fs.newInode(ctx, root, 0444, &filesystemsData{}),
+		"loadavg":     fs.newInode(ctx, root, 0444, &loadavgData{}),
+		"sys":         fs.newSysDir(ctx, root, k),
+		"meminfo":     fs.newInode(ctx, root, 0444, &meminfoData{}),
+		"mounts":      kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"),
+		"net":         kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"),
+		"stat":        fs.newInode(ctx, root, 0444, &statData{}),
+		"uptime":      fs.newInode(ctx, root, 0444, &uptimeData{}),
+		"version":     fs.newInode(ctx, root, 0444, &versionData{}),
 	}
 
 	inode := &tasksInode{
 		pidns:             pidns,
 		fs:                fs,
-		selfSymlink:       fs.newSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
-		threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
 		cgroupControllers: cgroupControllers,
 	}
-	inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
-
-	dentry := &kernfs.Dentry{}
-	dentry.Init(inode)
+	inode.InodeAttrs.Init(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	inode.EnableLeakCheck()
 
 	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	links := inode.OrderedChildren.Populate(dentry, contents)
+	links := inode.OrderedChildren.Populate(contents)
 	inode.IncLinks(links)
 
-	return inode, dentry
+	return inode
 }
 
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
-	// Try to lookup a corresponding task.
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+	// Check if a static entry was looked up.
+	if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil {
+		return d, nil
+	}
+
+	// Not a static entry. Try to lookup a corresponding task.
 	tid, err := strconv.ParseUint(name, 10, 64)
 	if err != nil {
+		root := auth.NewRootCredentials(i.pidns.UserNamespace())
 		// If it failed to parse, check if it's one of the special handled files.
 		switch name {
 		case selfName:
-			return i.selfSymlink, nil
+			return i.newSelfSymlink(ctx, root), nil
 		case threadSelfName:
-			return i.threadSelfSymlink, nil
+			return i.newThreadSelfSymlink(ctx, root), nil
 		}
 		return nil, syserror.ENOENT
 	}
@@ -115,12 +118,11 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 		return nil, syserror.ENOENT
 	}
 
-	taskDentry := i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers)
-	return taskDentry.VFSDentry(), nil
+	return i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers)
 }
 
-// IterDirents implements kernfs.inodeDynamicLookup.
-func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *tasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
 	// fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
 	const FIRST_PROCESS_ENTRY = 256
 
@@ -197,9 +199,11 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 	return maxTaskID, nil
 }
 
-// Open implements kernfs.Inode.
-func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+// Open implements kernfs.Inode.Open.
+func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 	if err != nil {
 		return nil, err
 	}
@@ -224,9 +228,16 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St
 	return stat, nil
 }
 
+// DecRef implements kernfs.Inode.DecRef.
+func (i *tasksInode) DecRef(ctx context.Context) {
+	i.tasksInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
 // staticFileSetStat implements a special static file that allows inode
 // attributes to be set. This is to support /proc files that are readonly, but
 // allow attributes to be set.
+//
+// +stateify savable
 type staticFileSetStat struct {
 	dynamicBytesFileSetAttr
 	vfs.StaticData
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 7d8983aa5..01b7a6678 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -31,7 +31,9 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// +stateify savable
 type selfSymlink struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeSymlink
@@ -41,16 +43,13 @@ type selfSymlink struct {
 
 var _ kernfs.Inode = (*selfSymlink)(nil)
 
-func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
-	inode := &selfSymlink{pidns: pidns}
-	inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
-	d := &kernfs.Dentry{}
-	d.Init(inode)
-	return d
+func (i *tasksInode) newSelfSymlink(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
+	inode := &selfSymlink{pidns: i.pidns}
+	inode.Init(ctx, creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777)
+	return inode
 }
 
-func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
 	t := kernel.TaskFromContext(ctx)
 	if t == nil {
 		// Who is reading this link?
@@ -63,17 +62,19 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
 	return strconv.FormatUint(uint64(tgid), 10), nil
 }
 
-func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
-	target, err := s.Readlink(ctx)
+func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	target, err := s.Readlink(ctx, mnt)
 	return vfs.VirtualDentry{}, target, err
 }
 
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
 func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
+// +stateify savable
 type threadSelfSymlink struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeSymlink
@@ -83,16 +84,13 @@ type threadSelfSymlink struct {
 
 var _ kernfs.Inode = (*threadSelfSymlink)(nil)
 
-func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
-	inode := &threadSelfSymlink{pidns: pidns}
-	inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
-	d := &kernfs.Dentry{}
-	d.Init(inode)
-	return d
+func (i *tasksInode) newThreadSelfSymlink(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
+	inode := &threadSelfSymlink{pidns: i.pidns}
+	inode.Init(ctx, creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777)
+	return inode
 }
 
-func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
 	t := kernel.TaskFromContext(ctx)
 	if t == nil {
 		// Who is reading this link?
@@ -106,12 +104,12 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
 	return fmt.Sprintf("%d/task/%d", tgid, tid), nil
 }
 
-func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
-	target, err := s.Readlink(ctx)
+func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	target, err := s.Readlink(ctx, mnt)
 	return vfs.VirtualDentry{}, target, err
 }
 
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
 func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
@@ -119,16 +117,20 @@ func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Creden
 // dynamicBytesFileSetAttr implements a special file that allows inode
 // attributes to be set. This is to support /proc files that are readonly, but
 // allow attributes to be set.
+//
+// +stateify savable
 type dynamicBytesFileSetAttr struct {
 	kernfs.DynamicBytesFile
 }
 
-// SetStat implements Inode.SetStat.
+// SetStat implements kernfs.Inode.SetStat.
 func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
 	return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts)
 }
 
 // cpuStats contains the breakdown of CPU time for /proc/stat.
+//
+// +stateify savable
 type cpuStats struct {
 	// user is time spent in userspace tasks with non-positive niceness.
 	user uint64
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 6435385ef..7c7afdcfa 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -27,9 +27,11 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// +stateify savable
 type tcpMemDir int
 
 const (
@@ -38,92 +40,93 @@ const (
 )
 
 // newSysDir returns the dentry corresponding to /proc/sys directory.
-func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
-	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-		"kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-			"hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}),
-			"shmall":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)),
-			"shmmax":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)),
-			"shmmni":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)),
+func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
+	return fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+		"kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+			"hostname": fs.newInode(ctx, root, 0444, &hostnameData{}),
+			"shmall":   fs.newInode(ctx, root, 0444, shmData(linux.SHMALL)),
+			"shmmax":   fs.newInode(ctx, root, 0444, shmData(linux.SHMMAX)),
+			"shmmni":   fs.newInode(ctx, root, 0444, shmData(linux.SHMMNI)),
 		}),
-		"vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-			"mmap_min_addr":     fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}),
-			"overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")),
+		"vm": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+			"mmap_min_addr":     fs.newInode(ctx, root, 0444, &mmapMinAddrData{k: k}),
+			"overcommit_memory": fs.newInode(ctx, root, 0444, newStaticFile("0\n")),
 		}),
-		"net": fs.newSysNetDir(root, k),
+		"net": fs.newSysNetDir(ctx, root, k),
 	})
 }
 
 // newSysNetDir returns the dentry corresponding to /proc/sys/net directory.
-func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
-	var contents map[string]*kernfs.Dentry
+func (fs *filesystem) newSysNetDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
+	var contents map[string]kernfs.Inode
 
 	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
 	// network namespace of the calling process.
 	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
-		contents = map[string]*kernfs.Dentry{
-			"ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-				"tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}),
-				"tcp_rmem":     fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
-				"tcp_sack":     fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
-				"tcp_wmem":     fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
+		contents = map[string]kernfs.Inode{
+			"ipv4": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+				"tcp_recovery": fs.newInode(ctx, root, 0644, &tcpRecoveryData{stack: stack}),
+				"tcp_rmem":     fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
+				"tcp_sack":     fs.newInode(ctx, root, 0644, &tcpSackData{stack: stack}),
+				"tcp_wmem":     fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
+				"ip_forward":   fs.newInode(ctx, root, 0444, &ipForwarding{stack: stack}),
 
 				// The following files are simple stubs until they are implemented in
 				// netstack, most of these files are configuration related. We use the
 				// value closest to the actual netstack behavior or any empty file, all
 				// of these files will have mode 0444 (read-only for all users).
-				"ip_local_port_range":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("16000   65535")),
-				"ip_local_reserved_ports": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
-				"ipfrag_time":             fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("30")),
-				"ip_nonlocal_bind":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"ip_no_pmtu_disc":         fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+				"ip_local_port_range":     fs.newInode(ctx, root, 0444, newStaticFile("16000   65535")),
+				"ip_local_reserved_ports": fs.newInode(ctx, root, 0444, newStaticFile("")),
+				"ipfrag_time":             fs.newInode(ctx, root, 0444, newStaticFile("30")),
+				"ip_nonlocal_bind":        fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"ip_no_pmtu_disc":         fs.newInode(ctx, root, 0444, newStaticFile("1")),
 
 				// tcp_allowed_congestion_control tell the user what they are able to
 				// do as an unprivledged process so we leave it empty.
-				"tcp_allowed_congestion_control":   fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
-				"tcp_available_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
-				"tcp_congestion_control":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
+				"tcp_allowed_congestion_control":   fs.newInode(ctx, root, 0444, newStaticFile("")),
+				"tcp_available_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")),
+				"tcp_congestion_control":           fs.newInode(ctx, root, 0444, newStaticFile("reno")),
 
 				// Many of the following stub files are features netstack doesn't
 				// support. The unsupported features return "0" to indicate they are
 				// disabled.
-				"tcp_base_mss":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1280")),
-				"tcp_dsack":                 fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_early_retrans":         fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_fack":                  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_fastopen":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_fastopen_key":          fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
-				"tcp_invalid_ratelimit":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_keepalive_intvl":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_keepalive_probes":      fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_keepalive_time":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("7200")),
-				"tcp_mtu_probing":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_no_metrics_save":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
-				"tcp_probe_interval":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_probe_threshold":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_retries1":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
-				"tcp_retries2":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("15")),
-				"tcp_rfc1337":               fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
-				"tcp_slow_start_after_idle": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
-				"tcp_synack_retries":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
-				"tcp_syn_retries":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
-				"tcp_timestamps":            fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+				"tcp_base_mss":              fs.newInode(ctx, root, 0444, newStaticFile("1280")),
+				"tcp_dsack":                 fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_early_retrans":         fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_fack":                  fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_fastopen":              fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_fastopen_key":          fs.newInode(ctx, root, 0444, newStaticFile("")),
+				"tcp_invalid_ratelimit":     fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_keepalive_intvl":       fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_keepalive_probes":      fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_keepalive_time":        fs.newInode(ctx, root, 0444, newStaticFile("7200")),
+				"tcp_mtu_probing":           fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_no_metrics_save":       fs.newInode(ctx, root, 0444, newStaticFile("1")),
+				"tcp_probe_interval":        fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_probe_threshold":       fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_retries1":              fs.newInode(ctx, root, 0444, newStaticFile("3")),
+				"tcp_retries2":              fs.newInode(ctx, root, 0444, newStaticFile("15")),
+				"tcp_rfc1337":               fs.newInode(ctx, root, 0444, newStaticFile("1")),
+				"tcp_slow_start_after_idle": fs.newInode(ctx, root, 0444, newStaticFile("1")),
+				"tcp_synack_retries":        fs.newInode(ctx, root, 0444, newStaticFile("5")),
+				"tcp_syn_retries":           fs.newInode(ctx, root, 0444, newStaticFile("3")),
+				"tcp_timestamps":            fs.newInode(ctx, root, 0444, newStaticFile("1")),
 			}),
-			"core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-				"default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")),
-				"message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")),
-				"message_cost":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
-				"optmem_max":    fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"rmem_default":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
-				"rmem_max":      fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
-				"somaxconn":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("128")),
-				"wmem_default":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
-				"wmem_max":      fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
+			"core": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+				"default_qdisc": fs.newInode(ctx, root, 0444, newStaticFile("pfifo_fast")),
+				"message_burst": fs.newInode(ctx, root, 0444, newStaticFile("10")),
+				"message_cost":  fs.newInode(ctx, root, 0444, newStaticFile("5")),
+				"optmem_max":    fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"rmem_default":  fs.newInode(ctx, root, 0444, newStaticFile("212992")),
+				"rmem_max":      fs.newInode(ctx, root, 0444, newStaticFile("212992")),
+				"somaxconn":     fs.newInode(ctx, root, 0444, newStaticFile("128")),
+				"wmem_default":  fs.newInode(ctx, root, 0444, newStaticFile("212992")),
+				"wmem_max":      fs.newInode(ctx, root, 0444, newStaticFile("212992")),
 			}),
 		}
 	}
 
-	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents)
+	return fs.newStaticDir(ctx, root, contents)
 }
 
 // mmapMinAddrData implements vfs.DynamicBytesSource for
@@ -174,7 +177,7 @@ type tcpSackData struct {
 
 var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil)
 
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
 func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	if d.enabled == nil {
 		sack, err := d.stack.TCPSACKEnabled()
@@ -232,7 +235,7 @@ type tcpRecoveryData struct {
 
 var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil)
 
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
 func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	recovery, err := d.stack.TCPRecovery()
 	if err != nil {
@@ -284,7 +287,7 @@ type tcpMemData struct {
 
 var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil)
 
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
 func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	d.mu.Lock()
 	defer d.mu.Unlock()
@@ -354,3 +357,63 @@ func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error {
 		panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
 	}
 }
+
+// ipForwarding implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/ipv4/ip_forwarding.
+//
+// +stateify savable
+type ipForwarding struct {
+	kernfs.DynamicBytesFile
+
+	stack   inet.Stack `state:"wait"`
+	enabled *bool
+}
+
+var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if ipf.enabled == nil {
+		enabled := ipf.stack.Forwarding(ipv4.ProtocolNumber)
+		ipf.enabled = &enabled
+	}
+
+	val := "0\n"
+	if *ipf.enabled {
+		// Technically, this is not quite compatible with Linux. Linux stores these
+		// as an integer, so if you write "2" into tcp_sack, you should get 2 back.
+		// Tough luck.
+		val = "1\n"
+	}
+	buf.WriteString(val)
+
+	return nil
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if offset != 0 {
+		// No need to handle partial writes thus far.
+		return 0, syserror.EINVAL
+	}
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+	if ipf.enabled == nil {
+		ipf.enabled = new(bool)
+	}
+	*ipf.enabled = v != 0
+	if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil {
+		return 0, err
+	}
+	return n, nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
index be54897bb..6cee22823 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
@@ -20,8 +20,10 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func newIPv6TestStack() *inet.TestStack {
@@ -76,3 +78,72 @@ func TestIfinet6(t *testing.T) {
 		t.Errorf("Got n.contents() = %v, want = %v", got, want)
 	}
 }
+
+// TestIPForwarding tests the implementation of
+// /proc/sys/net/ipv4/ip_forwarding
+func TestConfigureIPForwarding(t *testing.T) {
+	ctx := context.Background()
+	s := inet.NewTestStack()
+
+	var cases = []struct {
+		comment string
+		initial bool
+		str     string
+		final   bool
+	}{
+		{
+			comment: `Forwarding is disabled; write 1 and enable forwarding`,
+			initial: false,
+			str:     "1",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is disabled; write 0 and disable forwarding`,
+			initial: false,
+			str:     "0",
+			final:   false,
+		},
+		{
+			comment: `Forwarding is enabled; write 1 and enable forwarding`,
+			initial: true,
+			str:     "1",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is enabled; write 0 and disable forwarding`,
+			initial: true,
+			str:     "0",
+			final:   false,
+		},
+		{
+			comment: `Forwarding is disabled; write 2404 and enable forwarding`,
+			initial: false,
+			str:     "2404",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is enabled; write 2404 and enable forwarding`,
+			initial: true,
+			str:     "2404",
+			final:   true,
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.comment, func(t *testing.T) {
+			s.IPForwarding = c.initial
+
+			file := &ipForwarding{stack: s, enabled: &c.initial}
+
+			// Write the values.
+			src := usermem.BytesIOSequence([]byte(c.str))
+			if n, err := file.Write(ctx, src, 0); n != int64(len(c.str)) || err != nil {
+				t.Errorf("file.Write(ctx, nil, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str))
+			}
+
+			// Read the values from the stack and check them.
+			if got, want := s.IPForwarding, c.final; got != want {
+				t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want)
+			}
+		})
+	}
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 3c9297dee..2582ababd 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -67,6 +67,7 @@ var (
 	taskStaticFiles = map[string]testutil.DirentType{
 		"auxv":          linux.DT_REG,
 		"cgroup":        linux.DT_REG,
+		"cwd":           linux.DT_LNK,
 		"cmdline":       linux.DT_REG,
 		"comm":          linux.DT_REG,
 		"environ":       linux.DT_REG,
@@ -104,13 +105,16 @@ func setup(t *testing.T) *testutil.System {
 		AllowUserMount: true,
 	})
 
-	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{})
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("NewMountNamespace(): %v", err)
 	}
+	root := mntns.Root()
+	root.IncRef()
+	defer root.DecRef(ctx)
 	pop := &vfs.PathOperation{
-		Root:  mntns.Root(),
-		Start: mntns.Root(),
+		Root:  root,
+		Start: root,
 		Path:  fspath.Parse("/proc"),
 	}
 	if err := k.VFS().MkdirAt(ctx, creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil {
@@ -118,8 +122,8 @@ func setup(t *testing.T) *testutil.System {
 	}
 
 	pop = &vfs.PathOperation{
-		Root:  mntns.Root(),
-		Start: mntns.Root(),
+		Root:  root,
+		Start: root,
 		Path:  fspath.Parse("/proc"),
 	}
 	mntOpts := &vfs.MountOptions{
@@ -132,7 +136,7 @@ func setup(t *testing.T) *testutil.System {
 			},
 		},
 	}
-	if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil {
+	if _, err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil {
 		t.Fatalf("MountAt(/proc): %v", err)
 	}
 	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD
index 067c1657f..adb610213 100644
--- a/pkg/sentry/fsimpl/signalfd/BUILD
+++ b/pkg/sentry/fsimpl/signalfd/BUILD
@@ -8,7 +8,6 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/context",
         "//pkg/sentry/kernel",
         "//pkg/sentry/vfs",
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
index 6297e1df4..10f1452ef 100644
--- a/pkg/sentry/fsimpl/signalfd/signalfd.go
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -16,7 +16,6 @@ package signalfd
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -26,7 +25,9 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// SignalFileDescription implements FileDescriptionImpl for signal fds.
+// SignalFileDescription implements vfs.FileDescriptionImpl for signal fds.
+//
+// +stateify savable
 type SignalFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -43,7 +44,7 @@ type SignalFileDescription struct {
 	target *kernel.Task
 
 	// mu protects mask.
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
 	// mask is the signal mask. Protected by mu.
 	mask linux.SignalSet
@@ -83,7 +84,7 @@ func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) {
 	sfd.mask = mask
 }
 
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
 func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
 	// Attempt to dequeue relevant signals.
 	info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0)
@@ -93,8 +94,7 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen
 	}
 
 	// Copy out the signal info using the specified format.
-	var buf [128]byte
-	binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{
+	infoNative := linux.SignalfdSiginfo{
 		Signo:   uint32(info.Signo),
 		Errno:   info.Errno,
 		Code:    info.Code,
@@ -103,9 +103,13 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen
 		Status:  info.Status(),
 		Overrun: uint32(info.Overrun()),
 		Addr:    info.Addr(),
-	})
-	n, err := dst.CopyOut(ctx, buf[:])
-	return int64(n), err
+	}
+	n, err := infoNative.WriteTo(dst.Writer(ctx))
+	if err == usermem.ErrEndOfIOSequence {
+		// Partial copy-out ok.
+		err = nil
+	}
+	return n, err
 }
 
 // Readiness implements waiter.Waitable.Readiness.
@@ -132,5 +136,5 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) {
 	sfd.target.SignalUnregister(entry)
 }
 
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
 func (sfd *SignalFileDescription) Release(context.Context) {}
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
index c61818ff6..fda1fa942 100644
--- a/pkg/sentry/fsimpl/sockfs/sockfs.go
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -28,14 +28,16 @@ import (
 )
 
 // filesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type filesystemType struct{}
 
-// GetFilesystem implements FilesystemType.GetFilesystem.
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	panic("sockfs.filesystemType.GetFilesystem should never be called")
 }
 
-// Name implements FilesystemType.Name.
+// Name implements vfs.FilesystemType.Name.
 //
 // Note that registering sockfs is unnecessary, except for the fact that it
 // will not show up under /proc/filesystems as a result. This is a very minor
@@ -44,6 +46,10 @@ func (filesystemType) Name() string {
 	return "sockfs"
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (filesystemType) Release(ctx context.Context) {}
+
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 
@@ -80,30 +86,37 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 }
 
 // inode implements kernfs.Inode.
+//
+// +stateify savable
 type inode struct {
-	kernfs.InodeNotDirectory
-	kernfs.InodeNotSymlink
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
 }
 
 // Open implements kernfs.Inode.Open.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	return nil, syserror.ENXIO
 }
 
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.SOCKFS_MAGIC), nil
+}
+
 // NewDentry constructs and returns a sockfs dentry.
 //
 // Preconditions: mnt.Filesystem() must have been returned by NewFilesystem().
-func NewDentry(creds *auth.Credentials, mnt *vfs.Mount) *vfs.Dentry {
+func NewDentry(ctx context.Context, mnt *vfs.Mount) *vfs.Dentry {
 	fs := mnt.Filesystem().Impl().(*filesystem)
 
 	// File mode matches net/socket.c:sock_alloc.
 	filemode := linux.FileMode(linux.S_IFSOCK | 0600)
 	i := &inode{}
-	i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode)
+	i.InodeAttrs.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode)
 
 	d := &kernfs.Dentry{}
-	d.Init(i)
+	d.Init(&fs.Filesystem, i)
 	return d.VFSDentry()
 }
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index 1b548ccd4..09043b572 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -1,21 +1,42 @@
 load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "dir_refs",
+    out = "dir_refs.go",
+    package = "sys",
+    prefix = "dir",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "dir",
+    },
+)
+
 go_library(
     name = "sys",
     srcs = [
+        "dir_refs.go",
+        "kcov.go",
         "sys.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/coverage",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/refsvfs2",
+        "//pkg/sentry/arch",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
 
diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go
new file mode 100644
index 000000000..b13f141a8
--- /dev/null
+++ b/pkg/sentry/fsimpl/sys/kcov.go
@@ -0,0 +1,118 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sys
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
+	k := &kcovInode{}
+	k.InodeAttrs.Init(ctx, creds, 0, 0, fs.NextIno(), linux.S_IFREG|0600)
+	return k
+}
+
+// kcovInode implements kernfs.Inode.
+//
+// +stateify savable
+type kcovInode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+	implStatFS
+}
+
+func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	k := kernel.KernelFromContext(ctx)
+	if k == nil {
+		panic("KernelFromContext returned nil")
+	}
+	fd := &kcovFD{
+		inode: i,
+		kcov:  k.NewKcov(),
+	}
+
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{
+		DenyPRead:  true,
+		DenyPWrite: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// +stateify savable
+type kcovFD struct {
+	vfs.FileDescriptionDefaultImpl
+	vfs.NoLockFD
+
+	vfsfd vfs.FileDescription
+	inode *kcovInode
+	kcov  *kernel.Kcov
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *kcovFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	cmd := uint32(args[1].Int())
+	arg := args[2].Uint64()
+	switch uint32(cmd) {
+	case linux.KCOV_INIT_TRACE:
+		return 0, fd.kcov.InitTrace(arg)
+	case linux.KCOV_ENABLE:
+		return 0, fd.kcov.EnableTrace(ctx, uint8(arg))
+	case linux.KCOV_DISABLE:
+		if arg != 0 {
+			// This arg is unused; it should be 0.
+			return 0, syserror.EINVAL
+		}
+		return 0, fd.kcov.DisableTrace(ctx)
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// ConfigureMmap implements vfs.FileDescriptionImpl.ConfigureMmap.
+func (fd *kcovFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return fd.kcov.ConfigureMMap(ctx, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *kcovFD) Release(ctx context.Context) {
+	// kcov instances have reference counts in Linux, but this seems sufficient
+	// for our purposes.
+	fd.kcov.Clear(ctx)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *kcovFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return fd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *kcovFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	return fd.inode.Stat(ctx, fd.vfsfd.Mount().Filesystem(), opts)
+}
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 0401726b6..7d2147141 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -18,9 +18,11 @@ package sys
 import (
 	"bytes"
 	"fmt"
+	"strconv"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/coverage"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -28,14 +30,21 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// Name is the default filesystem name.
-const Name = "sysfs"
-const defaultSysDirMode = linux.FileMode(0755)
+const (
+	// Name is the default filesystem name.
+	Name                     = "sysfs"
+	defaultSysDirMode        = linux.FileMode(0755)
+	defaultMaxCachedDentries = uint64(1000)
+)
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 
@@ -47,6 +56,9 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	devMinor, err := vfsObj.GetAnonBlockDevMinor()
@@ -54,44 +66,73 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		return nil, nil, err
 	}
 
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	maxCachedDentries := defaultMaxCachedDentries
+	if str, ok := mopts["dentry_cache_limit"]; ok {
+		delete(mopts, "dentry_cache_limit")
+		maxCachedDentries, err = strconv.ParseUint(str, 10, 64)
+		if err != nil {
+			ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
+			return nil, nil, syserror.EINVAL
+		}
+	}
+
 	fs := &filesystem{
 		devMinor: devMinor,
 	}
+	fs.MaxCachedDentries = maxCachedDentries
 	fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
 
-	root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
-		"block": fs.newDir(creds, defaultSysDirMode, nil),
-		"bus":   fs.newDir(creds, defaultSysDirMode, nil),
-		"class": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
-			"power_supply": fs.newDir(creds, defaultSysDirMode, nil),
+	root := fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
+		"block": fs.newDir(ctx, creds, defaultSysDirMode, nil),
+		"bus":   fs.newDir(ctx, creds, defaultSysDirMode, nil),
+		"class": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
+			"power_supply": fs.newDir(ctx, creds, defaultSysDirMode, nil),
 		}),
-		"dev": fs.newDir(creds, defaultSysDirMode, nil),
-		"devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
-			"system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+		"dev": fs.newDir(ctx, creds, defaultSysDirMode, nil),
+		"devices": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
+			"system": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
 				"cpu": cpuDir(ctx, fs, creds),
 			}),
 		}),
-		"firmware": fs.newDir(creds, defaultSysDirMode, nil),
-		"fs":       fs.newDir(creds, defaultSysDirMode, nil),
-		"kernel":   fs.newDir(creds, defaultSysDirMode, nil),
-		"module":   fs.newDir(creds, defaultSysDirMode, nil),
-		"power":    fs.newDir(creds, defaultSysDirMode, nil),
+		"firmware": fs.newDir(ctx, creds, defaultSysDirMode, nil),
+		"fs":       fs.newDir(ctx, creds, defaultSysDirMode, nil),
+		"kernel":   kernelDir(ctx, fs, creds),
+		"module":   fs.newDir(ctx, creds, defaultSysDirMode, nil),
+		"power":    fs.newDir(ctx, creds, defaultSysDirMode, nil),
 	})
-	return fs.VFSFilesystem(), root.VFSDentry(), nil
+	var rootD kernfs.Dentry
+	rootD.Init(&fs.Filesystem, root)
+	return fs.VFSFilesystem(), rootD.VFSDentry(), nil
 }
 
-func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernfs.Dentry {
+func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode {
 	k := kernel.KernelFromContext(ctx)
 	maxCPUCores := k.ApplicationCores()
-	children := map[string]*kernfs.Dentry{
-		"online":   fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
-		"possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
-		"present":  fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
+	children := map[string]kernfs.Inode{
+		"online":   fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
+		"possible": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
+		"present":  fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
 	}
 	for i := uint(0); i < maxCPUCores; i++ {
-		children[fmt.Sprintf("cpu%d", i)] = fs.newDir(creds, linux.FileMode(0555), nil)
+		children[fmt.Sprintf("cpu%d", i)] = fs.newDir(ctx, creds, linux.FileMode(0555), nil)
 	}
-	return fs.newDir(creds, defaultSysDirMode, children)
+	return fs.newDir(ctx, creds, defaultSysDirMode, children)
+}
+
+func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode {
+	// If kcov is available, set up /sys/kernel/debug/kcov. Technically, debugfs
+	// should be mounted at debug/, but for our purposes, it is sufficient to
+	// keep it in sys.
+	var children map[string]kernfs.Inode
+	if coverage.KcovAvailable() {
+		children = map[string]kernfs.Inode{
+			"debug": fs.newDir(ctx, creds, linux.FileMode(0700), map[string]kernfs.Inode{
+				"kcov": fs.newKcovFile(ctx, creds),
+			}),
+		}
+	}
+	return fs.newDir(ctx, creds, defaultSysDirMode, children)
 }
 
 // Release implements vfs.FilesystemImpl.Release.
@@ -101,46 +142,62 @@ func (fs *filesystem) Release(ctx context.Context) {
 }
 
 // dir implements kernfs.Inode.
+//
+// +stateify savable
 type dir struct {
+	dirRefs
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
-	kernfs.InodeNoDynamicLookup
 	kernfs.InodeNotSymlink
 	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
 
 	locks vfs.FileLocks
-
-	dentry kernfs.Dentry
 }
 
-func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
 	d := &dir{}
-	d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
+	d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
 	d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	d.dentry.Init(d)
-
-	d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents))
-
-	return &d.dentry
+	d.EnableLeakCheck()
+	d.IncLinks(d.OrderedChildren.Populate(contents))
+	return d
 }
 
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
 func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
 // Open implements kernfs.Inode.Open.
-func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
+// DecRef implements kernfs.Inode.DecRef.
+func (d *dir) DecRef(ctx context.Context) {
+	d.dirRefs.DecRef(func() { d.Destroy(ctx) })
+}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
+}
+
 // cpuFile implements kernfs.Inode.
+//
+// +stateify savable
 type cpuFile struct {
+	implStatFS
 	kernfs.DynamicBytesFile
+
 	maxCores uint
 }
 
@@ -150,10 +207,16 @@ func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
-func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry {
+func (fs *filesystem) newCPUFile(ctx context.Context, creds *auth.Credentials, maxCores uint, mode linux.FileMode) kernfs.Inode {
 	c := &cpuFile{maxCores: maxCores}
-	c.DynamicBytesFile.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode)
-	d := &kernfs.Dentry{}
-	d.Init(c)
-	return d
+	c.DynamicBytesFile.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode)
+	return c
+}
+
+// +stateify savable
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
 }
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 9fd38b295..0a0d914cc 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -38,7 +38,7 @@ func newTestSystem(t *testing.T) *testutil.System {
 		AllowUserMount: true,
 	})
 
-	mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
+	mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create new mount namespace: %v", err)
 	}
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index 1813269e0..738c0c9cc 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -147,7 +147,12 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns
 		FSContext:               kernel.NewFSContextVFS2(root, cwd, 0022),
 		FDTable:                 k.NewFDTable(),
 	}
-	return k.TaskSet().NewTask(config)
+	t, err := k.TaskSet().NewTask(ctx, config)
+	if err != nil {
+		config.ThreadGroup.Release(ctx)
+		return nil, err
+	}
+	return t, nil
 }
 
 func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) {
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index 568132121..1a8525b06 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -46,16 +46,18 @@ type System struct {
 
 // NewSystem constructs a System.
 //
-// Precondition: Caller must hold a reference on MntNs, whose ownership
+// Precondition: Caller must hold a reference on mns, whose ownership
 // is transferred to the new System.
 func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System {
+	root := mns.Root()
+	root.IncRef()
 	s := &System{
 		t:     t,
 		Ctx:   ctx,
 		Creds: auth.CredentialsFromContext(ctx),
 		VFS:   v,
 		MntNs: mns,
-		Root:  mns.Root(),
+		Root:  root,
 	}
 	return s
 }
@@ -254,10 +256,10 @@ func (d *DirentCollector) Contains(name string, typ uint8) error {
 	defer d.mu.Unlock()
 	dirent, ok := d.dirents[name]
 	if !ok {
-		return fmt.Errorf("No dirent named %q found", name)
+		return fmt.Errorf("no dirent named %q found", name)
 	}
 	if dirent.Type != typ {
-		return fmt.Errorf("Dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent)
+		return fmt.Errorf("dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent)
 	}
 	return nil
 }
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
index 86beaa0a8..8853c8ad2 100644
--- a/pkg/sentry/fsimpl/timerfd/timerfd.go
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -26,8 +26,10 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// TimerFileDescription implements FileDescriptionImpl for timer fds. It also
+// TimerFileDescription implements vfs.FileDescriptionImpl for timer fds. It also
 // implements ktime.TimerListener.
+//
+// +stateify savable
 type TimerFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -62,7 +64,7 @@ func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock,
 	return &tfd.vfsfd, nil
 }
 
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
 func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
 	const sizeofUint64 = 8
 	if dst.NumBytes() < sizeofUint64 {
@@ -128,7 +130,7 @@ func (tfd *TimerFileDescription) ResumeTimer() {
 	tfd.timer.Resume()
 }
 
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
 func (tfd *TimerFileDescription) Release(context.Context) {
 	tfd.timer.Destroy()
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 5cd428d64..fe520b6fd 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -31,7 +31,7 @@ go_template_instance(
     out = "inode_refs.go",
     package = "tmpfs",
     prefix = "inode",
-    template = "//pkg/refs_vfs2:refs_template",
+    template = "//pkg/refsvfs2:refs_template",
     types = {
         "T": "inode",
     },
@@ -48,6 +48,7 @@ go_library(
         "inode_refs.go",
         "named_pipe.go",
         "regular_file.go",
+        "save_restore.go",
         "socket_file.go",
         "symlink.go",
         "tmpfs.go",
@@ -60,6 +61,7 @@ go_library(
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index d263147c2..3cc63e732 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -182,7 +182,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
 			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 				AllowUserMount: true,
 			})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
@@ -193,6 +193,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
 
 			// Create nested directories with given depth.
 			root := mntns.Root()
+			root.IncRef()
 			defer root.DecRef(ctx)
 			vd := root
 			vd.IncRef()
@@ -376,7 +377,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
 			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 				AllowUserMount: true,
 			})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
@@ -387,6 +388,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
 
 			// Create the mount point.
 			root := mntns.Root()
+			root.IncRef()
 			defer root.DecRef(ctx)
 			pop := vfs.PathOperation{
 				Root:  root,
@@ -405,7 +407,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
 			}
 			defer mountPoint.DecRef(ctx)
 			// Create and mount the submount.
-			if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
+			if _, err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
 				b.Fatalf("failed to mount tmpfs submount: %v", err)
 			}
 			filePathBuilder.WriteString(mountPointName)
diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
index ac54d420d..9129d35b7 100644
--- a/pkg/sentry/fsimpl/tmpfs/device_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
+// +stateify savable
 type deviceFile struct {
 	inode inode
 	kind  vfs.DeviceKind
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 78b4fc5be..e90669cf0 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// +stateify savable
 type directory struct {
 	// Since directories can't be hard-linked, each directory can only be
 	// associated with a single dentry, which we can store in the directory
@@ -44,7 +45,7 @@ type directory struct {
 	// (with inode == nil) that represent the iteration position of
 	// directoryFDs. childList is used to support directoryFD.IterDirents()
 	// efficiently. childList is protected by iterMu.
-	iterMu    sync.Mutex
+	iterMu    sync.Mutex `state:"nosave"`
 	childList dentryList
 }
 
@@ -57,8 +58,9 @@ func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.Fi
 	return dir
 }
 
-// Preconditions: filesystem.mu must be locked for writing. dir must not
-// already contain a child with the given name.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * dir must not already contain a child with the given name.
 func (dir *directory) insertChildLocked(child *dentry, name string) {
 	child.parent = &dir.dentry
 	child.name = name
@@ -85,6 +87,7 @@ func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error {
 	return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid)))
 }
 
+// +stateify savable
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index b0ec177e6..e39cd305b 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -25,7 +25,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Sync implements vfs.FilesystemImpl.Sync.
@@ -39,7 +38,9 @@ func (fs *filesystem) Sync(ctx context.Context) error {
 //
 // stepLocked is loosely analogous to fs/namei.c:walk_component().
 //
-// Preconditions: filesystem.mu must be locked. !rp.Done().
+// Preconditions:
+// * filesystem.mu must be locked.
+// * !rp.Done().
 func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
 	dir, ok := d.inode.impl.(*directory)
 	if !ok {
@@ -97,7 +98,9 @@ afterSymlink:
 // walkParentDirLocked is loosely analogous to Linux's
 // fs/namei.c:path_parentat().
 //
-// Preconditions: filesystem.mu must be locked. !rp.Done().
+// Preconditions:
+// * filesystem.mu must be locked.
+// * !rp.Done().
 func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
 	for !rp.Final() {
 		next, err := stepLocked(ctx, rp, d)
@@ -139,8 +142,9 @@ func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error)
 // doCreateAt is loosely analogous to a conjunction of Linux's
 // fs/namei.c:filename_create() and done_path_create().
 //
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
 func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
@@ -669,11 +673,11 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 		fs.mu.RUnlock()
 		return err
 	}
-	if err := d.inode.setStat(ctx, rp.Credentials(), &opts); err != nil {
-		fs.mu.RUnlock()
+	err = d.inode.setStat(ctx, rp.Credentials(), &opts)
+	fs.mu.RUnlock()
+	if err != nil {
 		return err
 	}
-	fs.mu.RUnlock()
 
 	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
 		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
@@ -701,16 +705,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 	if _, err := resolveLocked(ctx, rp); err != nil {
 		return linux.Statfs{}, err
 	}
-	statfs := linux.Statfs{
-		Type:         linux.TMPFS_MAGIC,
-		BlockSize:    usermem.PageSize,
-		FragmentSize: usermem.PageSize,
-		NameLength:   linux.NAME_MAX,
-		// TODO(b/29637826): Allow configuring a tmpfs size and enforce it.
-		Blocks:     0,
-		BlocksFree: 0,
-	}
-	return statfs, nil
+	return globalStatfs, nil
 }
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
@@ -775,7 +770,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return nil
 }
 
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
 func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
@@ -788,65 +783,68 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 	}
 	switch impl := d.inode.impl.(type) {
 	case *socketFile:
+		if impl.ep == nil {
+			return nil, syserror.ECONNREFUSED
+		}
 		return impl.ep, nil
 	default:
 		return nil, syserror.ECONNREFUSED
 	}
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
 	d, err := resolveLocked(ctx, rp)
 	if err != nil {
 		return nil, err
 	}
-	return d.inode.listxattr(size)
+	return d.inode.listXattr(size)
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
 	d, err := resolveLocked(ctx, rp)
 	if err != nil {
 		return "", err
 	}
-	return d.inode.getxattr(rp.Credentials(), &opts)
+	return d.inode.getXattr(rp.Credentials(), &opts)
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	fs.mu.RLock()
 	d, err := resolveLocked(ctx, rp)
 	if err != nil {
 		fs.mu.RUnlock()
 		return err
 	}
-	if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
-		fs.mu.RUnlock()
+	err = d.inode.setXattr(rp.Credentials(), &opts)
+	fs.mu.RUnlock()
+	if err != nil {
 		return err
 	}
-	fs.mu.RUnlock()
 
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
 	return nil
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	fs.mu.RLock()
 	d, err := resolveLocked(ctx, rp)
 	if err != nil {
 		fs.mu.RUnlock()
 		return err
 	}
-	if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
-		fs.mu.RUnlock()
+	err = d.inode.removeXattr(rp.Credentials(), name)
+	fs.mu.RUnlock()
+	if err != nil {
 		return err
 	}
-	fs.mu.RUnlock()
 
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
 	return nil
@@ -867,8 +865,16 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 		}
 		if d.parent == nil {
 			if d.name != "" {
-				// This must be an anonymous memfd file.
+				// This file must have been created by
+				// newUnlinkedRegularFileDescription(). In Linux,
+				// mm/shmem.c:__shmem_file_setup() =>
+				// fs/file_table.c:alloc_file_pseudo() sets the created
+				// dentry's dentry_operations to anon_ops, for which d_dname ==
+				// simple_dname. fs/d_path.c:simple_dname() defines the
+				// dentry's pathname to be its name, prefixed with "/" and
+				// suffixed with " (deleted)".
 				b.PrependComponent("/" + d.name)
+				b.AppendString(" (deleted)")
 				return vfs.PrependPathSyntheticError{}
 			}
 			return vfs.PrependPathAtNonMountRootError{}
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 739350cf0..d772db9e9 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// +stateify savable
 type namedPipe struct {
 	inode inode
 
@@ -28,8 +29,8 @@ type namedPipe struct {
 }
 
 // Preconditions:
-//   * fs.mu must be locked.
-//   * rp.Mount().CheckBeginWrite() has been called successfully.
+// * fs.mu must be locked.
+// * rp.Mount().CheckBeginWrite() has been called successfully.
 func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
 	file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
 	file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode)
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index ec2701d8b..2f856ce36 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -158,13 +158,14 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
 	}
 
 	// Create the pipe.
 	root := mntns.Root()
+	root.IncRef()
 	pop := vfs.PathOperation{
 		Root:  root,
 		Start: root,
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 0710b65db..98680fde9 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -36,11 +36,17 @@ import (
 )
 
 // regularFile is a regular (=S_IFREG) tmpfs file.
+//
+// +stateify savable
 type regularFile struct {
 	inode inode
 
 	// memFile is a platform.File used to allocate pages to this regularFile.
-	memFile *pgalloc.MemoryFile
+	memFile *pgalloc.MemoryFile `state:"nosave"`
+
+	// memoryUsageKind is the memory accounting category under which pages backing
+	// this regularFile's contents are accounted.
+	memoryUsageKind usage.MemoryKind
 
 	// mapsMu protects mappings.
 	mapsMu sync.Mutex `state:"nosave"`
@@ -62,7 +68,7 @@ type regularFile struct {
 	writableMappingPages uint64
 
 	// dataMu protects the fields below.
-	dataMu sync.RWMutex
+	dataMu sync.RWMutex `state:"nosave"`
 
 	// data maps offsets into the file to offsets into memFile that store
 	// the file's data.
@@ -86,14 +92,75 @@ type regularFile struct {
 
 func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
 	file := &regularFile{
-		memFile: fs.memFile,
-		seals:   linux.F_SEAL_SEAL,
+		memFile:         fs.mfp.MemoryFile(),
+		memoryUsageKind: usage.Tmpfs,
+		seals:           linux.F_SEAL_SEAL,
 	}
 	file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
 
+// newUnlinkedRegularFileDescription creates a regular file on the tmpfs
+// filesystem represented by mount and returns an FD representing that file.
+// The new file is not reachable by path traversal from any other file.
+//
+// newUnlinkedRegularFileDescription is analogous to Linux's
+// mm/shmem.c:__shmem_file_setup().
+//
+// Preconditions: mount must be a tmpfs mount.
+func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) {
+	fs, ok := mount.Filesystem().Impl().(*filesystem)
+	if !ok {
+		panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount")
+	}
+
+	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
+	d := fs.newDentry(inode)
+	defer d.DecRef(ctx)
+	d.name = name
+
+	fd := &regularFileFD{}
+	fd.Init(&inode.locks)
+	flags := uint32(linux.O_RDWR)
+	if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return fd, nil
+}
+
+// NewZeroFile creates a new regular file and file description as for
+// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is
+// initially (implicitly) filled with zeroes.
+//
+// Preconditions: mount must be a tmpfs mount.
+func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) {
+	// Compare mm/shmem.c:shmem_zero_setup().
+	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero")
+	if err != nil {
+		return nil, err
+	}
+	rf := fd.inode().impl.(*regularFile)
+	rf.memoryUsageKind = usage.Anonymous
+	rf.size = size
+	return &fd.vfsfd, err
+}
+
+// NewMemfd creates a new regular file and file description as for
+// memfd_create.
+//
+// Preconditions: mount must be a tmpfs mount.
+func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
+	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name)
+	if err != nil {
+		return nil, err
+	}
+	if allowSeals {
+		fd.inode().impl.(*regularFile).seals = 0
+	}
+	return &fd.vfsfd, nil
+}
+
 // truncate grows or shrinks the file to the given size. It returns true if the
 // file size was updated.
 func (rf *regularFile) truncate(newSize uint64) (bool, error) {
@@ -226,7 +293,7 @@ func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.
 		optional.End = pgend
 	}
 
-	cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+	cerr := rf.data.Fill(ctx, required, optional, rf.size, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
 		// Newly-allocated pages are zeroed, so we don't need to do anything.
 		return dsts.NumBytes(), nil
 	})
@@ -260,13 +327,14 @@ func (*regularFile) InvalidateUnsavable(context.Context) error {
 	return nil
 }
 
+// +stateify savable
 type regularFileFD struct {
 	fileDescription
 
 	// off is the file offset. off is accessed using atomic memory operations.
 	// offMu serializes operations that may mutate off.
 	off   int64
-	offMu sync.Mutex
+	offMu sync.Mutex `state:"nosave"`
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
@@ -575,7 +643,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64,
 		case gap.Ok():
 			// Allocate memory for the write.
 			gapMR := gap.Range().Intersect(pgMR)
-			fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs)
+			fr, err := rw.file.memFile.Allocate(gapMR.Length(), rw.file.memoryUsageKind)
 			if err != nil {
 				retErr = err
 				goto exitLoop
diff --git a/tools/nogo/data/data.go b/pkg/sentry/fsimpl/tmpfs/save_restore.go
index eb84d0d27..b27f75cc2 100644
--- a/tools/nogo/data/data.go
+++ b/pkg/sentry/fsimpl/tmpfs/save_restore.go
@@ -1,4 +1,4 @@
-// Copyright 2019 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package data contains shared data for nogo analysis.
-//
-// This is used to break a dependency cycle.
-package data
+package tmpfs
 
-// Objdump is the dumped binary under analysis.
-var Objdump string
+// afterLoad is called by stateify.
+func (rf *regularFile) afterLoad() {
+	rf.memFile = rf.inode.fs.mfp.MemoryFile()
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go
index 3ed650474..5699d5975 100644
--- a/pkg/sentry/fsimpl/tmpfs/socket_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go
@@ -21,6 +21,8 @@ import (
 )
 
 // socketFile is a socket (=S_IFSOCK) tmpfs file.
+//
+// +stateify savable
 type socketFile struct {
 	inode inode
 	ep    transport.BoundEndpoint
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index b0de5fabe..a102a2ee2 100644
--- a/pkg/sentry/fsimpl/tmpfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -19,6 +19,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
+// +stateify savable
 type symlink struct {
 	inode  inode
 	target string // immutable
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index de2af6d01..4ce859d57 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -51,14 +51,19 @@ import (
 const Name = "tmpfs"
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	vfsfs vfs.Filesystem
 
-	// memFile is used to allocate pages to for regular files.
-	memFile *pgalloc.MemoryFile
+	// mfp is used to allocate memory that stores regular file contents. mfp is
+	// immutable.
+	mfp pgalloc.MemoryFileProvider
 
 	// clock is a realtime clock used to set timestamps in file operations.
 	clock time.Clock
@@ -67,9 +72,11 @@ type filesystem struct {
 	devMinor uint32
 
 	// mu serializes changes to the Dentry tree.
-	mu sync.RWMutex
+	mu sync.RWMutex `state:"nosave"`
 
 	nextInoMinusOne uint64 // accessed using atomic memory operations
+
+	root *dentry
 }
 
 // Name implements vfs.FilesystemType.Name.
@@ -77,7 +84,12 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
 // FilesystemOpts is used to pass configuration data to tmpfs.
+//
+// +stateify savable
 type FilesystemOpts struct {
 	// RootFileType is the FileType of the filesystem root. Valid values
 	// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
@@ -95,8 +107,8 @@ type FilesystemOpts struct {
 
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx)
-	if memFileProvider == nil {
+	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
+	if mfp == nil {
 		panic("MemoryFileProviderFromContext returned nil")
 	}
 
@@ -170,7 +182,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	}
 	clock := time.RealtimeClockFromContext(ctx)
 	fs := filesystem{
-		memFile:  memFileProvider.MemoryFile(),
+		mfp:      mfp,
 		clock:    clock,
 		devMinor: devMinor,
 	}
@@ -188,6 +200,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		fs.vfsfs.DecRef(ctx)
 		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
 	}
+	fs.root = root
 	return &fs.vfsfs, &root.vfsd, nil
 }
 
@@ -199,9 +212,61 @@ func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *au
 // Release implements vfs.FilesystemImpl.Release.
 func (fs *filesystem) Release(ctx context.Context) {
 	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.mu.Lock()
+	if fs.root.inode.isDir() {
+		fs.root.releaseChildrenLocked(ctx)
+	}
+	fs.mu.Unlock()
+}
+
+// releaseChildrenLocked is called on the mount point by filesystem.Release() to
+// destroy all objects in the mount. It performs a depth-first walk of the
+// filesystem and "unlinks" everything by decrementing link counts
+// appropriately. There should be no open file descriptors when this is called,
+// so each inode should only have one outstanding reference that is removed once
+// its link count hits zero.
+//
+// Note that we do not update filesystem state precisely while tearing down (for
+// instance, the child maps are ignored)--we only care to remove all remaining
+// references so that every filesystem object gets destroyed. Also note that we
+// do not need to trigger DecRef on the mount point itself or any child mount;
+// these are taken care of by the destructor of the enclosing MountNamespace.
+//
+// Precondition: filesystem.mu is held.
+func (d *dentry) releaseChildrenLocked(ctx context.Context) {
+	dir := d.inode.impl.(*directory)
+	for _, child := range dir.childMap {
+		if child.inode.isDir() {
+			child.releaseChildrenLocked(ctx)
+			child.inode.decLinksLocked(ctx) // link for child/.
+			dir.inode.decLinksLocked(ctx)   // link for child/..
+		}
+		child.inode.decLinksLocked(ctx) // link for child
+	}
+}
+
+// immutable
+var globalStatfs = linux.Statfs{
+	Type:         linux.TMPFS_MAGIC,
+	BlockSize:    usermem.PageSize,
+	FragmentSize: usermem.PageSize,
+	NameLength:   linux.NAME_MAX,
+
+	// tmpfs currently does not support configurable size limits. In Linux,
+	// such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from
+	// statfs(2). However, many applications treat this as having a size limit
+	// of 0. To work around this, claim to have a very large but non-zero size,
+	// chosen to ensure that BlockSize * Blocks does not overflow int64 (which
+	// applications may also handle incorrectly).
+	// TODO(b/29637826): allow configuring a tmpfs size and enforce it.
+	Blocks:          math.MaxInt64 / usermem.PageSize,
+	BlocksFree:      math.MaxInt64 / usermem.PageSize,
+	BlocksAvailable: math.MaxInt64 / usermem.PageSize,
 }
 
 // dentry implements vfs.DentryImpl.
+//
+// +stateify savable
 type dentry struct {
 	vfsd vfs.Dentry
 
@@ -281,6 +346,8 @@ func (d *dentry) Watches() *vfs.Watches {
 func (d *dentry) OnZeroWatches(context.Context) {}
 
 // inode represents a filesystem object.
+//
+// +stateify savable
 type inode struct {
 	// fs is the owning filesystem. fs is immutable.
 	fs *filesystem
@@ -297,12 +364,12 @@ type inode struct {
 
 	// Inode metadata. Writing multiple fields atomically requires holding
 	// mu, othewise atomic operations can be used.
-	mu    sync.Mutex
-	mode  uint32 // file type and mode
-	nlink uint32 // protected by filesystem.mu instead of inode.mu
-	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
-	gid   uint32 // auth.KGID, but ...
-	ino   uint64 // immutable
+	mu    sync.Mutex `state:"nosave"`
+	mode  uint32     // file type and mode
+	nlink uint32     // protected by filesystem.mu instead of inode.mu
+	uid   uint32     // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid   uint32     // auth.KGID, but ...
+	ino   uint64     // immutable
 
 	// Linux's tmpfs has no concept of btime.
 	atime int64 // nanoseconds
@@ -340,8 +407,10 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth
 
 // incLinksLocked increments i's link count.
 //
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
-// i.nlink < maxLinks.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * i.nlink != 0.
+// * i.nlink < maxLinks.
 func (i *inode) incLinksLocked() {
 	if i.nlink == 0 {
 		panic("tmpfs.inode.incLinksLocked() called with no existing links")
@@ -355,7 +424,9 @@ func (i *inode) incLinksLocked() {
 // decLinksLocked decrements i's link count. If the link count reaches 0, we
 // remove a reference on i as well.
 //
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * i.nlink != 0.
 func (i *inode) decLinksLocked(ctx context.Context) {
 	if i.nlink == 0 {
 		panic("tmpfs.inode.decLinksLocked() called with no existing links")
@@ -594,66 +665,59 @@ func (i *inode) touchCMtime() {
 	i.mu.Unlock()
 }
 
-// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds
-// inode.mu.
+// Preconditions:
+// * The caller has called vfs.Mount.CheckBeginWrite().
+// * inode.mu must be locked.
 func (i *inode) touchCMtimeLocked() {
 	now := i.fs.clock.Now().Nanoseconds()
 	atomic.StoreInt64(&i.mtime, now)
 	atomic.StoreInt64(&i.ctime, now)
 }
 
-func (i *inode) listxattr(size uint64) ([]string, error) {
-	return i.xattrs.Listxattr(size)
+func (i *inode) listXattr(size uint64) ([]string, error) {
+	return i.xattrs.ListXattr(size)
 }
 
-func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
-	if err := i.checkPermissions(creds, vfs.MayRead); err != nil {
+func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
+	if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
 		return "", err
 	}
-	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
-		return "", syserror.EOPNOTSUPP
-	}
-	if !i.userXattrSupported() {
-		return "", syserror.ENODATA
-	}
-	return i.xattrs.Getxattr(opts)
+	return i.xattrs.GetXattr(opts)
 }
 
-func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
-	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
+	if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
 		return err
 	}
-	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
-		return syserror.EOPNOTSUPP
-	}
-	if !i.userXattrSupported() {
-		return syserror.EPERM
-	}
-	return i.xattrs.Setxattr(opts)
+	return i.xattrs.SetXattr(opts)
 }
 
-func (i *inode) removexattr(creds *auth.Credentials, name string) error {
-	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+func (i *inode) removeXattr(creds *auth.Credentials, name string) error {
+	if err := i.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
 		return err
 	}
-	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+	return i.xattrs.RemoveXattr(name)
+}
+
+func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+	// We currently only support extended attributes in the user.* and
+	// trusted.* namespaces. See b/148380782.
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) && !strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) {
 		return syserror.EOPNOTSUPP
 	}
-	if !i.userXattrSupported() {
-		return syserror.EPERM
+	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&i.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&i.gid))
+	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+		return err
 	}
-	return i.xattrs.Removexattr(name)
-}
-
-// Extended attributes in the user.* namespace are only supported for regular
-// files and directories.
-func (i *inode) userXattrSupported() bool {
-	filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode)
-	return filetype == linux.S_IFREG || filetype == linux.S_IFDIR
+	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
 }
 
 // fileDescription is embedded by tmpfs implementations of
 // vfs.FileDescriptionImpl.
+//
+// +stateify savable
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -693,20 +757,25 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 	return nil
 }
 
-// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
-	return fd.inode().listxattr(size)
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+	return globalStatfs, nil
 }
 
-// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
-	return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts)
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.inode().listXattr(size)
 }
 
-// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
-func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+	return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts)
+}
+
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
 	d := fd.dentry()
-	if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+	if err := d.inode.setXattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
 		return err
 	}
 
@@ -715,10 +784,10 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption
 	return nil
 }
 
-// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
-func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
 	d := fd.dentry()
-	if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+	if err := d.inode.removeXattr(auth.CredentialsFromContext(ctx), name); err != nil {
 		return err
 	}
 
@@ -727,37 +796,6 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
 	return nil
 }
 
-// NewMemfd creates a new tmpfs regular file and file description that can back
-// an anonymous fd created by memfd_create.
-func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
-	fs, ok := mount.Filesystem().Impl().(*filesystem)
-	if !ok {
-		panic("NewMemfd() called with non-tmpfs mount")
-	}
-
-	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
-	// S_IRWXUGO.
-	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
-	rf := inode.impl.(*regularFile)
-	if allowSeals {
-		rf.seals = 0
-	}
-
-	d := fs.newDentry(inode)
-	defer d.DecRef(ctx)
-	d.name = name
-
-	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
-	// FMODE_READ | FMODE_WRITE.
-	var fd regularFileFD
-	fd.Init(&inode.locks)
-	flags := uint32(linux.O_RDWR)
-	if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
-		return nil, err
-	}
-	return &fd.vfsfd, nil
-}
-
 // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
 func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
 	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
index 6f3e3ae6f..fc5323abc 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
@@ -41,11 +41,12 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr
 	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
 	if err != nil {
 		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
 	}
 	root := mntns.Root()
+	root.IncRef()
 	return vfsObj, root, func() {
 		root.DecRef(ctx)
 		mntns.DecRef(ctx)
diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD
index 28d2a4bcb..e265be0ee 100644
--- a/pkg/sentry/fsimpl/verity/BUILD
+++ b/pkg/sentry/fsimpl/verity/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 licenses(["notice"])
 
@@ -6,6 +6,7 @@ go_library(
     name = "verity",
     srcs = [
         "filesystem.go",
+        "save_restore.go",
         "verity.go",
     ],
     visibility = ["//pkg/sentry:internal"],
@@ -13,11 +14,38 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/marshal/primitive",
+        "//pkg/merkletree",
+        "//pkg/refsvfs2",
+        "//pkg/sentry/arch",
         "//pkg/sentry/fs/lock",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
+
+go_test(
+    name = "verity_test",
+    srcs = [
+        "verity_test.go",
+    ],
+    library = ":verity",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/fsimpl/testutil",
+        "//pkg/sentry/fsimpl/tmpfs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index 78c6074bd..81dfed266 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -15,9 +15,17 @@
 package verity
 
 import (
+	"bytes"
+	"fmt"
+	"io"
+	"strconv"
+	"strings"
+	"sync/atomic"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/merkletree"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -91,10 +99,498 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de
 	putDentrySlice(*ds)
 }
 
-// resolveLocked resolves rp to an existing file.
-func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
-	// TODO(b/159261227): Implement resolveLocked.
-	return nil, nil
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may have a reference count of zero, and which therefore
+// should be dropped once traversal is complete, are appended to ds.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done().
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+
+afterSymlink:
+	name := rp.Component()
+	if name == "." {
+		rp.Advance()
+		return d, nil
+	}
+	if name == ".." {
+		if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return d, nil
+		}
+		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return d.parent, nil
+	}
+	child, err := fs.getChildLocked(ctx, d, name, ds)
+	if err != nil {
+		return nil, err
+	}
+	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+		return nil, err
+	}
+	if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance()
+	return child, nil
+}
+
+// verifyChild verifies the hash of child against the already verified hash of
+// the parent to ensure the child is expected.  verifyChild triggers a sentry
+// panic if unexpected modifications to the file system are detected. In
+// noCrashOnVerificationFailure mode it returns a syserror instead.
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// TODO(b/166474175): Investigate all possible errors returned in this
+// function, and make sure we differentiate all errors that indicate unexpected
+// modifications to the file system from the ones that are not harmful.
+func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) {
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+
+	// Get the path to the child dentry. This is only used to provide path
+	// information in failure case.
+	childPath, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD)
+	if err != nil {
+		return nil, err
+	}
+
+	fs.verityMu.RLock()
+	defer fs.verityMu.RUnlock()
+	// Read the offset of the child from the extended attributes of the
+	// corresponding Merkle tree file.
+	// This is the offset of the hash for child in its parent's Merkle tree
+	// file.
+	off, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  child.lowerMerkleVD,
+		Start: child.lowerMerkleVD,
+	}, &vfs.GetXattrOptions{
+		Name: merkleOffsetInParentXattr,
+		Size: sizeOfStringInt32,
+	})
+
+	// The Merkle tree file for the child should have been created and
+	// contains the expected xattrs. If the file or the xattr does not
+	// exist, it indicates unexpected modifications to the file system.
+	if err == syserror.ENOENT || err == syserror.ENODATA {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+	// The offset xattr should be an integer. If it's not, it indicates
+	// unexpected modifications to the file system.
+	offset, err := strconv.Atoi(off)
+	if err != nil {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err))
+	}
+
+	// Open parent Merkle tree file to read and verify child's hash.
+	parentMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  parent.lowerMerkleVD,
+		Start: parent.lowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+
+	// The parent Merkle tree file should have been created. If it's
+	// missing, it indicates an unexpected modification to the file system.
+	if err == syserror.ENOENT {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	// dataSize is the size of raw data for the Merkle tree. For a file,
+	// dataSize is the size of the whole file. For a directory, dataSize is
+	// the size of all its children's hashes.
+	dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{
+		Name: merkleSizeXattr,
+		Size: sizeOfStringInt32,
+	})
+
+	// The Merkle tree file for the child should have been created and
+	// contains the expected xattrs. If the file or the xattr does not
+	// exist, it indicates unexpected modifications to the file system.
+	if err == syserror.ENOENT || err == syserror.ENODATA {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	// The dataSize xattr should be an integer. If it's not, it indicates
+	// unexpected modifications to the file system.
+	parentSize, err := strconv.Atoi(dataSize)
+	if err != nil {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
+	}
+
+	fdReader := vfs.FileReadWriteSeeker{
+		FD:  parentMerkleFD,
+		Ctx: ctx,
+	}
+
+	parentStat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  parent.lowerVD,
+		Start: parent.lowerVD,
+	}, &vfs.StatOptions{})
+	if err == syserror.ENOENT {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get parent stat for %s: %v", childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	// Since we are verifying against a directory Merkle tree, buf should
+	// contain the hash of the children in the parent Merkle tree when
+	// Verify returns with success.
+	var buf bytes.Buffer
+	if _, err := merkletree.Verify(&merkletree.VerifyParams{
+		Out:                   &buf,
+		File:                  &fdReader,
+		Tree:                  &fdReader,
+		Size:                  int64(parentSize),
+		Name:                  parent.name,
+		Mode:                  uint32(parentStat.Mode),
+		UID:                   parentStat.UID,
+		GID:                   parentStat.GID,
+		ReadOffset:            int64(offset),
+		ReadSize:              int64(merkletree.DigestSize()),
+		Expected:              parent.hash,
+		DataAndTreeInSameFile: true,
+	}); err != nil && err != io.EOF {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Verification for %s failed: %v", childPath, err))
+	}
+
+	// Cache child hash when it's verified the first time.
+	if len(child.hash) == 0 {
+		child.hash = buf.Bytes()
+	}
+	return child, nil
+}
+
+// verifyStat verifies the stat against the verified hash. The mode/uid/gid of
+// the file is cached after verified.
+func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Statx) error {
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+
+	// Get the path to the child dentry. This is only used to provide path
+	// information in failure case.
+	childPath, err := vfsObj.PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD)
+	if err != nil {
+		return err
+	}
+
+	fs.verityMu.RLock()
+	defer fs.verityMu.RUnlock()
+
+	fd, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  d.lowerMerkleVD,
+		Start: d.lowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+	if err == syserror.ENOENT {
+		return alertIntegrityViolation(fmt.Sprintf("Failed to open merkle file for %s: %v", childPath, err))
+	}
+	if err != nil {
+		return err
+	}
+
+	merkleSize, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{
+		Name: merkleSizeXattr,
+		Size: sizeOfStringInt32,
+	})
+
+	if err == syserror.ENODATA {
+		return alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", merkleSizeXattr, childPath, err))
+	}
+	if err != nil {
+		return err
+	}
+
+	size, err := strconv.Atoi(merkleSize)
+	if err != nil {
+		return alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
+	}
+
+	fdReader := vfs.FileReadWriteSeeker{
+		FD:  fd,
+		Ctx: ctx,
+	}
+
+	var buf bytes.Buffer
+	params := &merkletree.VerifyParams{
+		Out:        &buf,
+		Tree:       &fdReader,
+		Size:       int64(size),
+		Name:       d.name,
+		Mode:       uint32(stat.Mode),
+		UID:        stat.UID,
+		GID:        stat.GID,
+		ReadOffset: 0,
+		// Set read size to 0 so only the metadata is verified.
+		ReadSize:              0,
+		Expected:              d.hash,
+		DataAndTreeInSameFile: false,
+	}
+	if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR {
+		params.DataAndTreeInSameFile = true
+	}
+
+	if _, err := merkletree.Verify(params); err != nil && err != io.EOF {
+		return alertIntegrityViolation(fmt.Sprintf("Verification stat for %s failed: %v", childPath, err))
+	}
+	d.mode = uint32(stat.Mode)
+	d.uid = stat.UID
+	d.gid = stat.GID
+	d.size = uint32(size)
+	return nil
+}
+
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+	if child, ok := parent.children[name]; ok {
+		// If verity is enabled on child, we should check again whether
+		// the file and the corresponding Merkle tree are as expected,
+		// in order to catch deletion/renaming after the last time it's
+		// accessed.
+		if child.verityEnabled() {
+			vfsObj := fs.vfsfs.VirtualFilesystem()
+			// Get the path to the child dentry. This is only used
+			// to provide path information in failure case.
+			path, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD)
+			if err != nil {
+				return nil, err
+			}
+
+			childVD, err := parent.getLowerAt(ctx, vfsObj, name)
+			if err == syserror.ENOENT {
+				// The file was previously accessed. If the
+				// file does not exist now, it indicates an
+				// unexpected modification to the file system.
+				return nil, alertIntegrityViolation(fmt.Sprintf("Target file %s is expected but missing", path))
+			}
+			if err != nil {
+				return nil, err
+			}
+			defer childVD.DecRef(ctx)
+
+			childMerkleVD, err := parent.getLowerAt(ctx, vfsObj, merklePrefix+name)
+			// The Merkle tree file was previous accessed. If it
+			// does not exist now, it indicates an unexpected
+			// modification to the file system.
+			if err == syserror.ENOENT {
+				return nil, alertIntegrityViolation(fmt.Sprintf("Expected Merkle file for target %s but none found", path))
+			}
+			if err != nil {
+				return nil, err
+			}
+
+			defer childMerkleVD.DecRef(ctx)
+		}
+
+		// If enabling verification on files/directories is not allowed
+		// during runtime, all cached children are already verified. If
+		// runtime enable is allowed and the parent directory is
+		// enabled, we should verify the child hash here because it may
+		// be cached before enabled.
+		if fs.allowRuntimeEnable {
+			if parent.verityEnabled() {
+				if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+					return nil, err
+				}
+			}
+			if child.verityEnabled() {
+				vfsObj := fs.vfsfs.VirtualFilesystem()
+				mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
+				stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+					Root:  child.lowerVD,
+					Start: child.lowerVD,
+				}, &vfs.StatOptions{
+					Mask: mask,
+				})
+				if err != nil {
+					return nil, err
+				}
+				if err := fs.verifyStat(ctx, child, stat); err != nil {
+					return nil, err
+				}
+			}
+		}
+		return child, nil
+	}
+	child, err := fs.lookupAndVerifyLocked(ctx, parent, name)
+	if err != nil {
+		return nil, err
+	}
+	if parent.children == nil {
+		parent.children = make(map[string]*dentry)
+	}
+	parent.children[name] = child
+	// child's refcount is initially 0, so it may be dropped after traversal.
+	*ds = appendDentry(*ds, child)
+	return child, nil
+}
+
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+
+	childVD, childErr := parent.getLowerAt(ctx, vfsObj, name)
+	// We will handle ENOENT separately, as it may indicate unexpected
+	// modifications to the file system, and may cause a sentry panic.
+	if childErr != nil && childErr != syserror.ENOENT {
+		return nil, childErr
+	}
+
+	// The dentry needs to be cleaned up if any error occurs. IncRef will be
+	// called if a verity child dentry is successfully created.
+	if childErr == nil {
+		defer childVD.DecRef(ctx)
+	}
+
+	childMerkleVD, childMerkleErr := parent.getLowerAt(ctx, vfsObj, merklePrefix+name)
+	// We will handle ENOENT separately, as it may indicate unexpected
+	// modifications to the file system, and may cause a sentry panic.
+	if childMerkleErr != nil && childMerkleErr != syserror.ENOENT {
+		return nil, childMerkleErr
+	}
+
+	// The dentry needs to be cleaned up if any error occurs. IncRef will be
+	// called if a verity child dentry is successfully created.
+	if childMerkleErr == nil {
+		defer childMerkleVD.DecRef(ctx)
+	}
+
+	// Get the path to the parent dentry. This is only used to provide path
+	// information in failure case.
+	parentPath, err := vfsObj.PathnameWithDeleted(ctx, parent.fs.rootDentry.lowerVD, parent.lowerVD)
+	if err != nil {
+		return nil, err
+	}
+
+	// TODO(b/166474175): Investigate all possible errors of childErr and
+	// childMerkleErr, and make sure we differentiate all errors that
+	// indicate unexpected modifications to the file system from the ones
+	// that are not harmful.
+	if childErr == syserror.ENOENT && childMerkleErr == nil {
+		// Failed to get child file/directory dentry. However the
+		// corresponding Merkle tree is found. This indicates an
+		// unexpected modification to the file system that
+		// removed/renamed the child.
+		return nil, alertIntegrityViolation(fmt.Sprintf("Target file %s is expected but missing", parentPath+"/"+name))
+	} else if childErr == nil && childMerkleErr == syserror.ENOENT {
+		// If in allowRuntimeEnable mode, and the Merkle tree file is
+		// not created yet, we create an empty Merkle tree file, so that
+		// if the file is enabled through ioctl, we have the Merkle tree
+		// file open and ready to use.
+		// This may cause empty and unused Merkle tree files in
+		// allowRuntimeEnable mode, if they are never enabled. This
+		// does not affect verification, as we rely on cached hash to
+		// decide whether to perform verification, not the existence of
+		// the Merkle tree file. Also, those Merkle tree files are
+		// always hidden and cannot be accessed by verity fs users.
+		if fs.allowRuntimeEnable {
+			childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+				Root:  parent.lowerVD,
+				Start: parent.lowerVD,
+				Path:  fspath.Parse(merklePrefix + name),
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDWR | linux.O_CREAT,
+				Mode:  0644,
+			})
+			if err != nil {
+				return nil, err
+			}
+			childMerkleFD.DecRef(ctx)
+			childMerkleVD, err = parent.getLowerAt(ctx, vfsObj, merklePrefix+name)
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			// If runtime enable is not allowed. This indicates an
+			// unexpected modification to the file system that
+			// removed/renamed the Merkle tree file.
+			return nil, alertIntegrityViolation(fmt.Sprintf("Expected Merkle file for target %s but none found", parentPath+"/"+name))
+		}
+	} else if childErr == syserror.ENOENT && childMerkleErr == syserror.ENOENT {
+		// Both the child and the corresponding Merkle tree are missing.
+		// This could be an unexpected modification or due to incorrect
+		// parameter.
+		// TODO(b/167752508): Investigate possible ways to differentiate
+		// cases that both files are deleted from cases that they never
+		// exist in the file system.
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to find file %s", parentPath+"/"+name))
+	}
+
+	mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
+	stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  childVD,
+		Start: childVD,
+	}, &vfs.StatOptions{
+		Mask: mask,
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	child := fs.newDentry()
+	child.lowerVD = childVD
+	child.lowerMerkleVD = childMerkleVD
+
+	// Increase the reference for both childVD and childMerkleVD as they are
+	// held by child. If this function fails and the child is destroyed, the
+	// references will be decreased in destroyLocked.
+	childVD.IncRef()
+	childMerkleVD.IncRef()
+
+	parent.IncRef()
+	child.parent = parent
+	child.name = name
+
+	child.mode = uint32(stat.Mode)
+	child.uid = stat.UID
+	child.gid = stat.GID
+
+	// Verify child hash. This should always be performed unless in
+	// allowRuntimeEnable mode and the parent directory hasn't been enabled
+	// yet.
+	if parent.verityEnabled() {
+		if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+			child.destroyLocked(ctx)
+			return nil, err
+		}
+	}
+	if child.verityEnabled() {
+		if err := fs.verifyStat(ctx, child, stat); err != nil {
+			child.destroyLocked(ctx)
+			return nil, err
+		}
+	}
+
+	return child, nil
 }
 
 // walkParentDirLocked resolves all but the last path component of rp to an
@@ -104,8 +600,39 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 //
 // Preconditions: fs.renameMu must be locked. !rp.Done().
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
-	// TODO(b/159261227): Implement walkParentDirLocked.
-	return nil, nil
+	for !rp.Final() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	for !rp.Done() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if rp.MustBeDir() && !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
 }
 
 // AccessAt implements vfs.Filesystem.Impl.AccessAt.
@@ -179,8 +706,183 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 
 // OpenAt implements vfs.FilesystemImpl.OpenAt.
 func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	//TODO(b/159261227): Implement OpenAt.
-	return nil, nil
+	// Verity fs is read-only.
+	if opts.Flags&(linux.O_WRONLY|linux.O_CREAT) != 0 {
+		return nil, syserror.EROFS
+	}
+
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+
+	start := rp.Start().Impl().(*dentry)
+	if rp.Done() {
+		return start.openLocked(ctx, rp, &opts)
+	}
+
+afterTrailingSymlink:
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+
+	// Check for search permission in the parent directory.
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+
+	// Open existing child or follow symlink.
+	parent.dirMu.Lock()
+	child, err := fs.stepLocked(ctx, rp, parent, false /*mayFollowSymlinks*/, &ds)
+	parent.dirMu.Unlock()
+	if err != nil {
+		return nil, err
+	}
+	if child.isSymlink() && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		start = parent
+		goto afterTrailingSymlink
+	}
+	return child.openLocked(ctx, rp, &opts)
+}
+
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// Users should not open the Merkle tree files. Those are for verity fs
+	// use only.
+	if strings.Contains(d.name, merklePrefix) {
+		return nil, syserror.EPERM
+	}
+	ats := vfs.AccessTypesForOpenFlags(opts)
+	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
+		return nil, err
+	}
+
+	// Verity fs is read-only.
+	if ats&vfs.MayWrite != 0 {
+		return nil, syserror.EROFS
+	}
+
+	// Get the path to the target file. This is only used to provide path
+	// information in failure case.
+	path, err := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD)
+	if err != nil {
+		return nil, err
+	}
+
+	// Open the file in the underlying file system.
+	lowerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+	}, opts)
+
+	// The file should exist, as we succeeded in finding its dentry. If it's
+	// missing, it indicates an unexpected modification to the file system.
+	if err != nil {
+		if err == syserror.ENOENT {
+			return nil, alertIntegrityViolation(fmt.Sprintf("File %s expected but not found", path))
+		}
+		return nil, err
+	}
+
+	// lowerFD needs to be cleaned up if any error occurs. IncRef will be
+	// called if a verity FD is successfully created.
+	defer lowerFD.DecRef(ctx)
+
+	// Open the Merkle tree file corresponding to the current file/directory
+	// to be used later for verifying Read/Walk.
+	merkleReader, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  d.lowerMerkleVD,
+		Start: d.lowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+
+	// The Merkle tree file should exist, as we succeeded in finding its
+	// dentry. If it's missing, it indicates an unexpected modification to
+	// the file system.
+	if err != nil {
+		if err == syserror.ENOENT {
+			return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path))
+		}
+		return nil, err
+	}
+
+	// merkleReader needs to be cleaned up if any error occurs. IncRef will
+	// be called if a verity FD is successfully created.
+	defer merkleReader.DecRef(ctx)
+
+	lowerFlags := lowerFD.StatusFlags()
+	lowerFDOpts := lowerFD.Options()
+	var merkleWriter *vfs.FileDescription
+	var parentMerkleWriter *vfs.FileDescription
+
+	// Only open the Merkle tree files for write if in allowRuntimeEnable
+	// mode.
+	if d.fs.allowRuntimeEnable {
+		merkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.lowerMerkleVD,
+			Start: d.lowerMerkleVD,
+		}, &vfs.OpenOptions{
+			Flags: linux.O_WRONLY | linux.O_APPEND,
+		})
+		if err != nil {
+			if err == syserror.ENOENT {
+				return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path))
+			}
+			return nil, err
+		}
+		// merkleWriter is cleaned up if any error occurs. IncRef will
+		// be called if a verity FD is created successfully.
+		defer merkleWriter.DecRef(ctx)
+
+		if d.parent != nil {
+			parentMerkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+				Root:  d.parent.lowerMerkleVD,
+				Start: d.parent.lowerMerkleVD,
+			}, &vfs.OpenOptions{
+				Flags: linux.O_WRONLY | linux.O_APPEND,
+			})
+			if err != nil {
+				if err == syserror.ENOENT {
+					parentPath, _ := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.parent.lowerVD)
+					return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", parentPath))
+				}
+				return nil, err
+			}
+			// parentMerkleWriter is cleaned up if any error occurs. IncRef
+			// will be called if a verity FD is created successfully.
+			defer parentMerkleWriter.DecRef(ctx)
+		}
+	}
+
+	fd := &fileDescription{
+		d:                  d,
+		lowerFD:            lowerFD,
+		merkleReader:       merkleReader,
+		merkleWriter:       merkleWriter,
+		parentMerkleWriter: parentMerkleWriter,
+		isDir:              d.isDir(),
+	}
+
+	if err := fd.vfsfd.Init(fd, lowerFlags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil {
+		return nil, err
+	}
+	lowerFD.IncRef()
+	merkleReader.IncRef()
+	if merkleWriter != nil {
+		merkleWriter.IncRef()
+	}
+	if parentMerkleWriter != nil {
+		parentMerkleWriter.IncRef()
+	}
+	return &fd.vfsfd, err
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
@@ -218,6 +920,8 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
+// TODO(b/170157489): Investigate whether stats other than Mode/UID/GID should
+// be verified.
 func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
@@ -235,6 +939,11 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	if err != nil {
 		return linux.Statx{}, err
 	}
+	if d.verityEnabled() {
+		if err := fs.verifyStat(ctx, d, stat); err != nil {
+			return linux.Statx{}, err
+		}
+	}
 	return stat, nil
 }
 
@@ -256,7 +965,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return syserror.EROFS
 }
 
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
 func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
@@ -267,8 +976,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -277,14 +986,14 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
 		return nil, err
 	}
 	lowerVD := d.lowerVD
-	return fs.vfsfs.VirtualFilesystem().ListxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+	return fs.vfsfs.VirtualFilesystem().ListXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
 		Root:  lowerVD,
 		Start: lowerVD,
 	}, size)
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -293,20 +1002,20 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 		return "", err
 	}
 	lowerVD := d.lowerVD
-	return fs.vfsfs.VirtualFilesystem().GetxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+	return fs.vfsfs.VirtualFilesystem().GetXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
 		Root:  lowerVD,
 		Start: lowerVD,
 	}, &opts)
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	// Verity file system is read-only.
 	return syserror.EROFS
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	// Verity file system is read-only.
 	return syserror.EROFS
 }
diff --git a/pkg/sentry/fsimpl/verity/save_restore.go b/pkg/sentry/fsimpl/verity/save_restore.go
new file mode 100644
index 000000000..4a161163c
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/save_restore.go
@@ -0,0 +1,27 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package verity
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/refsvfs2"
+)
+
+func (d *dentry) afterLoad() {
+	if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&d.refs) != -1 {
+		refsvfs2.Register(d, "verity.dentry")
+	}
+}
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index cb29d33a5..e2cbb206f 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -22,29 +22,62 @@
 package verity
 
 import (
+	"fmt"
+	"math"
+	"strconv"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
+	"gvisor.dev/gvisor/pkg/merkletree"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Name is the default filesystem name.
 const Name = "verity"
 
-// testOnlyDebugging allows verity file system to return error instead of
-// crashing the application when a malicious action is detected. This should
-// only be set for tests.
-var testOnlyDebugging bool
+// merklePrefix is the prefix of the Merkle tree files. For example, the Merkle
+// tree file for "/foo" is "/.merkle.verity.foo".
+const merklePrefix = ".merkle.verity."
+
+// merkleoffsetInParentXattr is the extended attribute name specifying the
+// offset of child hash in its parent's Merkle tree.
+const merkleOffsetInParentXattr = "user.merkle.offset"
+
+// merkleSizeXattr is the extended attribute name specifying the size of data
+// hashed by the corresponding Merkle tree. For a file, it's the size of the
+// whole file. For a directory, it's the size of all its children's hashes.
+const merkleSizeXattr = "user.merkle.size"
+
+// sizeOfStringInt32 is the size for a 32 bit integer stored as string in
+// extended attributes. The maximum value of a 32 bit integer is 10 digits.
+const sizeOfStringInt32 = 10
+
+// noCrashOnVerificationFailure indicates whether the sandbox should panic
+// whenever verification fails. If true, an error is returned instead of
+// panicking. This should only be set for tests.
+// TOOD(b/165661693): Decide whether to panic or return error based on this
+// flag.
+var noCrashOnVerificationFailure bool
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	vfsfs vfs.Filesystem
 
@@ -69,11 +102,24 @@ type filesystem struct {
 	// renameMu synchronizes renaming with non-renaming operations in order
 	// to ensure consistent lock ordering between dentry.dirMu in different
 	// dentries.
-	renameMu sync.RWMutex
+	renameMu sync.RWMutex `state:"nosave"`
+
+	// verityMu synchronizes enabling verity files, protects files or
+	// directories from being enabled by different threads simultaneously.
+	// It also ensures that verity does not access files that are being
+	// enabled.
+	//
+	// Also, the directory Merkle trees depends on the generated trees of
+	// its children. So they shouldn't be enabled the same time. This lock
+	// is for the whole file system to ensure that no more than one file is
+	// enabled the same time.
+	verityMu sync.RWMutex
 }
 
 // InternalFilesystemOptions may be passed as
 // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+//
+// +stateify savable
 type InternalFilesystemOptions struct {
 	// RootMerkleFileName is the name of the verity root Merkle tree file.
 	RootMerkleFileName string
@@ -93,10 +139,10 @@ type InternalFilesystemOptions struct {
 	// system wrapped by verity file system.
 	LowerGetFSOptions vfs.GetFilesystemOptions
 
-	// TestOnlyDebugging allows verity file system to return error instead
-	// of crashing the application when a malicious action is detected. This
-	// should only be set for tests.
-	TestOnlyDebugging bool
+	// NoCrashOnVerificationFailure indicates whether the sandbox should
+	// panic whenever verification fails. If true, an error is returned
+	// instead of panicking. This should only be set for tests.
+	NoCrashOnVerificationFailure bool
 }
 
 // Name implements vfs.FilesystemType.Name.
@@ -104,10 +150,129 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
+// alertIntegrityViolation alerts a violation of integrity, which usually means
+// unexpected modification to the file system is detected. In
+// noCrashOnVerificationFailure mode, it returns EIO, otherwise it panic.
+func alertIntegrityViolation(msg string) error {
+	if noCrashOnVerificationFailure {
+		return syserror.EIO
+	}
+	panic(msg)
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	//TODO(b/159261227): Implement GetFilesystem.
-	return nil, nil, nil
+	iopts, ok := opts.InternalData.(InternalFilesystemOptions)
+	if !ok {
+		ctx.Warningf("verity.FilesystemType.GetFilesystem: missing verity configs")
+		return nil, nil, syserror.EINVAL
+	}
+	noCrashOnVerificationFailure = iopts.NoCrashOnVerificationFailure
+
+	// Mount the lower file system. The lower file system is wrapped inside
+	// verity, and should not be exposed or connected.
+	mopts := &vfs.MountOptions{
+		GetFilesystemOptions: iopts.LowerGetFSOptions,
+		InternalMount:        true,
+	}
+	mnt, err := vfsObj.MountDisconnected(ctx, creds, "", iopts.LowerName, mopts)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	fs := &filesystem{
+		creds:              creds.Fork(),
+		lowerMount:         mnt,
+		allowRuntimeEnable: iopts.AllowRuntimeEnable,
+	}
+	fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+	// Construct the root dentry.
+	d := fs.newDentry()
+	d.refs = 1
+	lowerVD := vfs.MakeVirtualDentry(mnt, mnt.Root())
+	lowerVD.IncRef()
+	d.lowerVD = lowerVD
+
+	rootMerkleName := merklePrefix + iopts.RootMerkleFileName
+
+	lowerMerkleVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+		Path:  fspath.Parse(rootMerkleName),
+	}, &vfs.GetDentryOptions{})
+
+	// If runtime enable is allowed, the root merkle tree may be absent. We
+	// should create the tree file.
+	if err == syserror.ENOENT && fs.allowRuntimeEnable {
+		lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  lowerVD,
+			Start: lowerVD,
+			Path:  fspath.Parse(rootMerkleName),
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDWR | linux.O_CREAT,
+			Mode:  0644,
+		})
+		if err != nil {
+			fs.vfsfs.DecRef(ctx)
+			d.DecRef(ctx)
+			return nil, nil, err
+		}
+		lowerMerkleFD.DecRef(ctx)
+		lowerMerkleVD, err = vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  lowerVD,
+			Start: lowerVD,
+			Path:  fspath.Parse(rootMerkleName),
+		}, &vfs.GetDentryOptions{})
+		if err != nil {
+			fs.vfsfs.DecRef(ctx)
+			d.DecRef(ctx)
+			return nil, nil, err
+		}
+	} else if err != nil {
+		// Failed to get dentry for the root Merkle file. This
+		// indicates an unexpected modification that removed/renamed
+		// the root Merkle file, or it's never generated.
+		fs.vfsfs.DecRef(ctx)
+		d.DecRef(ctx)
+		return nil, nil, alertIntegrityViolation("Failed to find root Merkle file")
+	}
+	d.lowerMerkleVD = lowerMerkleVD
+
+	// Get metadata from the underlying file system.
+	const statMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID
+	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+	}, &vfs.StatOptions{
+		Mask: statMask,
+	})
+	if err != nil {
+		fs.vfsfs.DecRef(ctx)
+		d.DecRef(ctx)
+		return nil, nil, err
+	}
+
+	d.mode = uint32(stat.Mode)
+	d.uid = stat.UID
+	d.gid = stat.GID
+	d.hash = make([]byte, len(iopts.RootHash))
+
+	if !fs.allowRuntimeEnable {
+		if err := fs.verifyStat(ctx, d, stat); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	copy(d.hash, iopts.RootHash)
+	d.vfsd.Init(d)
+
+	fs.rootDentry = d
+
+	return &fs.vfsfs, &d.vfsd, nil
 }
 
 // Release implements vfs.FilesystemImpl.Release.
@@ -116,6 +281,8 @@ func (fs *filesystem) Release(ctx context.Context) {
 }
 
 // dentry implements vfs.DentryImpl.
+//
+// +stateify savable
 type dentry struct {
 	vfsd vfs.Dentry
 
@@ -124,11 +291,12 @@ type dentry struct {
 	// fs is the owning filesystem. fs is immutable.
 	fs *filesystem
 
-	// mode, uid and gid are the file mode, owner, and group of the file in
-	// the underlying file system.
+	// mode, uid, gid and size are the file mode, owner, group, and size of
+	// the file in the underlying file system.
 	mode uint32
 	uid  uint32
 	gid  uint32
+	size uint32
 
 	// parent is the dentry corresponding to this dentry's parent directory.
 	// name is this dentry's name in parent. If this dentry is a filesystem
@@ -142,7 +310,7 @@ type dentry struct {
 	// and dirents (if not nil) is a cache of dirents as returned by
 	// directoryFDs representing this directory. children is protected by
 	// dirMu.
-	dirMu    sync.Mutex
+	dirMu    sync.Mutex `state:"nosave"`
 	children map[string]*dentry
 
 	// lowerVD is the VirtualDentry in the underlying file system.
@@ -152,8 +320,8 @@ type dentry struct {
 	// in the underlying file system.
 	lowerMerkleVD vfs.VirtualDentry
 
-	// rootHash is the rootHash for the current file or directory.
-	rootHash []byte
+	// hash is the calculated hash for the current file or directory.
+	hash []byte
 }
 
 // newDentry creates a new dentry representing the given verity file. The
@@ -166,6 +334,9 @@ func (fs *filesystem) newDentry() *dentry {
 		fs: fs,
 	}
 	d.vfsd.Init(d)
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Register(d, "verity.dentry")
+	}
 	return d
 }
 
@@ -228,6 +399,9 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 	if d.lowerVD.Ok() {
 		d.lowerVD.DecRef(ctx)
 	}
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Unregister(d, "verity.dentry")
+	}
 
 	if d.lowerMerkleVD.Ok() {
 		d.lowerMerkleVD.DecRef(ctx)
@@ -247,6 +421,11 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 	}
 }
 
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (d *dentry) LeakMessage() string {
+	return fmt.Sprintf("[verity.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs))
+}
+
 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
 func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
 	//TODO(b/159261227): Implement InotifyWithParent.
@@ -275,6 +454,24 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
+// verityEnabled checks whether the file is enabled with verity features. It
+// should always be true if runtime enable is not allowed. In runtime enable
+// mode, it returns true if the target has been enabled with
+// ioctl(FS_IOC_ENABLE_VERITY).
+func (d *dentry) verityEnabled() bool {
+	return !d.fs.allowRuntimeEnable || len(d.hash) != 0
+}
+
+// getLowerAt returns the dentry in the underlying file system, which is
+// represented by filename relative to d.
+func (d *dentry) getLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, filename string) (vfs.VirtualDentry, error) {
+	return vfsObj.GetDentryAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+		Path:  fspath.Parse(filename),
+	}, &vfs.GetDentryOptions{})
+}
+
 func (d *dentry) readlink(ctx context.Context) (string, error) {
 	return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
 		Root:  d.lowerVD,
@@ -286,6 +483,8 @@ func (d *dentry) readlink(ctx context.Context) (string, error) {
 // FileDescription is a wrapper of the underlying lowerFD, with support to build
 // Merkle trees through the Linux fs-verity API to verify contents read from
 // lowerFD.
+//
+// +stateify savable
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -314,6 +513,10 @@ type fileDescription struct {
 	// directory that contains the current file/directory. This is only used
 	// if allowRuntimeEnable is set to true.
 	parentMerkleWriter *vfs.FileDescription
+
+	// off is the file offset. off is protected by mu.
+	mu  sync.Mutex `state:"nosave"`
+	off int64
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
@@ -335,6 +538,11 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
 	if err != nil {
 		return linux.Statx{}, err
 	}
+	if fd.d.verityEnabled() {
+		if err := fd.d.fs.verifyStat(ctx, fd.d, stat); err != nil {
+			return linux.Statx{}, err
+		}
+	}
 	return stat, nil
 }
 
@@ -344,12 +552,325 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 	return syserror.EPERM
 }
 
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	n := int64(0)
+	switch whence {
+	case linux.SEEK_SET:
+		// use offset as specified
+	case linux.SEEK_CUR:
+		n = fd.off
+	case linux.SEEK_END:
+		n = int64(fd.d.size)
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset > math.MaxInt64-n {
+		return 0, syserror.EINVAL
+	}
+	offset += n
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// generateMerkle generates a Merkle tree file for fd. If fd points to a file
+// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The hash
+// of the generated Merkle tree and the data size is returned.  If fd points to
+// a regular file, the data is the content of the file. If fd points to a
+// directory, the data is all hahes of its children, written to the Merkle tree
+// file.
+func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) {
+	fdReader := vfs.FileReadWriteSeeker{
+		FD:  fd.lowerFD,
+		Ctx: ctx,
+	}
+	merkleReader := vfs.FileReadWriteSeeker{
+		FD:  fd.merkleReader,
+		Ctx: ctx,
+	}
+	merkleWriter := vfs.FileReadWriteSeeker{
+		FD:  fd.merkleWriter,
+		Ctx: ctx,
+	}
+	params := &merkletree.GenerateParams{
+		TreeReader: &merkleReader,
+		TreeWriter: &merkleWriter,
+	}
+
+	switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT {
+	case linux.S_IFREG:
+		// For a regular file, generate a Merkle tree based on its
+		// content.
+		var err error
+		stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			return nil, 0, err
+		}
+
+		params.File = &fdReader
+		params.Size = int64(stat.Size)
+		params.Name = fd.d.name
+		params.Mode = uint32(stat.Mode)
+		params.UID = stat.UID
+		params.GID = stat.GID
+		params.DataAndTreeInSameFile = false
+	case linux.S_IFDIR:
+		// For a directory, generate a Merkle tree based on the hashes
+		// of its children that has already been written to the Merkle
+		// tree file.
+		merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			return nil, 0, err
+		}
+
+		params.Size = int64(merkleStat.Size)
+
+		stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			return nil, 0, err
+		}
+
+		params.File = &merkleReader
+		params.Name = fd.d.name
+		params.Mode = uint32(stat.Mode)
+		params.UID = stat.UID
+		params.GID = stat.GID
+		params.DataAndTreeInSameFile = true
+	default:
+		// TODO(b/167728857): Investigate whether and how we should
+		// enable other types of file.
+		return nil, 0, syserror.EINVAL
+	}
+	hash, err := merkletree.Generate(params)
+	return hash, uint64(params.Size), err
+}
+
+// enableVerity enables verity features on fd by generating a Merkle tree file
+// and stores its hash in its parent directory's Merkle tree.
+func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (uintptr, error) {
+	if !fd.d.fs.allowRuntimeEnable {
+		return 0, syserror.EPERM
+	}
+
+	fd.d.fs.verityMu.Lock()
+	defer fd.d.fs.verityMu.Unlock()
+
+	// In allowRuntimeEnable mode, the underlying fd and read/write fd for
+	// the Merkle tree file should have all been initialized. For any file
+	// or directory other than the root, the parent Merkle tree file should
+	// have also been initialized.
+	if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || (fd.parentMerkleWriter == nil && fd.d != fd.d.fs.rootDentry) {
+		return 0, alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds")
+	}
+
+	hash, dataSize, err := fd.generateMerkle(ctx)
+	if err != nil {
+		return 0, err
+	}
+
+	if fd.parentMerkleWriter != nil {
+		stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			return 0, err
+		}
+
+		// Write the hash of fd to the parent directory's Merkle tree
+		// file, as it should be part of the parent Merkle tree data.
+		// parentMerkleWriter is open with O_APPEND, so it should write
+		// directly to the end of the file.
+		if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(hash), vfs.WriteOptions{}); err != nil {
+			return 0, err
+		}
+
+		// Record the offset of the hash of fd in parent directory's
+		// Merkle tree file.
+		if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+			Name:  merkleOffsetInParentXattr,
+			Value: strconv.Itoa(int(stat.Size)),
+		}); err != nil {
+			return 0, err
+		}
+	}
+
+	// Record the size of the data being hashed for fd.
+	if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+		Name:  merkleSizeXattr,
+		Value: strconv.Itoa(int(dataSize)),
+	}); err != nil {
+		return 0, err
+	}
+	fd.d.hash = append(fd.d.hash, hash...)
+	return 0, nil
+}
+
+// measureVerity returns the hash of fd, saved in verityDigest.
+func (fd *fileDescription) measureVerity(ctx context.Context, uio usermem.IO, verityDigest usermem.Addr) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		return 0, syserror.EINVAL
+	}
+	var metadata linux.DigestMetadata
+
+	// If allowRuntimeEnable is true, an empty fd.d.hash indicates that
+	// verity is not enabled for the file. If allowRuntimeEnable is false,
+	// this is an integrity violation because all files should have verity
+	// enabled, in which case fd.d.hash should be set.
+	if len(fd.d.hash) == 0 {
+		if fd.d.fs.allowRuntimeEnable {
+			return 0, syserror.ENODATA
+		}
+		return 0, alertIntegrityViolation("Ioctl measureVerity: no hash found")
+	}
+
+	// The first part of VerityDigest is the metadata.
+	if _, err := metadata.CopyIn(t, verityDigest); err != nil {
+		return 0, err
+	}
+	if metadata.DigestSize < uint16(len(fd.d.hash)) {
+		return 0, syserror.EOVERFLOW
+	}
+
+	// Populate the output digest size, since DigestSize is both input and
+	// output.
+	metadata.DigestSize = uint16(len(fd.d.hash))
+
+	// First copy the metadata.
+	if _, err := metadata.CopyOut(t, verityDigest); err != nil {
+		return 0, err
+	}
+
+	// Now copy the root hash bytes to the memory after metadata.
+	_, err := t.CopyOutBytes(usermem.Addr(uintptr(verityDigest)+linux.SizeOfDigestMetadata), fd.d.hash)
+	return 0, err
+}
+
+func (fd *fileDescription) verityFlags(ctx context.Context, uio usermem.IO, flags usermem.Addr) (uintptr, error) {
+	f := int32(0)
+
+	// All enabled files should store a hash. This flag is not settable via
+	// FS_IOC_SETFLAGS.
+	if len(fd.d.hash) != 0 {
+		f |= linux.FS_VERITY_FL
+	}
+
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		return 0, syserror.EINVAL
+	}
+	_, err := primitive.CopyInt32Out(t, flags, f)
+	return 0, err
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	switch cmd := args[1].Uint(); cmd {
+	case linux.FS_IOC_ENABLE_VERITY:
+		return fd.enableVerity(ctx, uio)
+	case linux.FS_IOC_MEASURE_VERITY:
+		return fd.measureVerity(ctx, uio, args[2].Pointer())
+	case linux.FS_IOC_GETFLAGS:
+		return fd.verityFlags(ctx, uio, args[2].Pointer())
+	default:
+		// TODO(b/169682228): Investigate which ioctl commands should
+		// be allowed.
+		return 0, syserror.ENOSYS
+	}
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// Implement Read with PRead by setting offset.
+	fd.mu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	// No need to verify if the file is not enabled yet in
+	// allowRuntimeEnable mode.
+	if !fd.d.verityEnabled() {
+		return fd.lowerFD.PRead(ctx, dst, offset, opts)
+	}
+
+	fd.d.fs.verityMu.RLock()
+	defer fd.d.fs.verityMu.RUnlock()
+	// dataSize is the size of the whole file.
+	dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{
+		Name: merkleSizeXattr,
+		Size: sizeOfStringInt32,
+	})
+
+	// The Merkle tree file for the child should have been created and
+	// contains the expected xattrs. If the xattr does not exist, it
+	// indicates unexpected modifications to the file system.
+	if err == syserror.ENODATA {
+		return 0, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err))
+	}
+	if err != nil {
+		return 0, err
+	}
+
+	// The dataSize xattr should be an integer. If it's not, it indicates
+	// unexpected modifications to the file system.
+	size, err := strconv.Atoi(dataSize)
+	if err != nil {
+		return 0, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err))
+	}
+
+	dataReader := vfs.FileReadWriteSeeker{
+		FD:  fd.lowerFD,
+		Ctx: ctx,
+	}
+
+	merkleReader := vfs.FileReadWriteSeeker{
+		FD:  fd.merkleReader,
+		Ctx: ctx,
+	}
+
+	n, err := merkletree.Verify(&merkletree.VerifyParams{
+		Out:                   dst.Writer(ctx),
+		File:                  &dataReader,
+		Tree:                  &merkleReader,
+		Size:                  int64(size),
+		Name:                  fd.d.name,
+		Mode:                  fd.d.mode,
+		UID:                   fd.d.uid,
+		GID:                   fd.d.gid,
+		ReadOffset:            offset,
+		ReadSize:              dst.NumBytes(),
+		Expected:              fd.d.hash,
+		DataAndTreeInSameFile: false,
+	})
+	if err != nil {
+		return 0, alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err))
+	}
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.EROFS
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.EROFS
+}
+
 // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
 func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
-	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+	return fd.lowerFD.LockPOSIX(ctx, uid, t, start, length, whence, block)
 }
 
 // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
 func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
-	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+	return fd.lowerFD.UnlockPOSIX(ctx, uid, start, length, whence)
 }
diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go
new file mode 100644
index 000000000..c647cbfd3
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/verity_test.go
@@ -0,0 +1,700 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package verity
+
+import (
+	"fmt"
+	"io"
+	"math/rand"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// rootMerkleFilename is the name of the root Merkle tree file.
+const rootMerkleFilename = "root.verity"
+
+// maxDataSize is the maximum data size written to the file for test.
+const maxDataSize = 100000
+
+// newVerityRoot creates a new verity mount, and returns the root. The
+// underlying file system is tmpfs. If the error is not nil, then cleanup
+// should be called when the root is no longer needed.
+func newVerityRoot(t *testing.T) (*vfs.VirtualFilesystem, vfs.VirtualDentry, *kernel.Task, error) {
+	k, err := testutil.Boot()
+	if err != nil {
+		t.Fatalf("testutil.Boot: %v", err)
+	}
+
+	ctx := k.SupervisorContext()
+
+	rand.Seed(time.Now().UnixNano())
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(ctx); err != nil {
+		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err)
+	}
+
+	vfsObj.MustRegisterFilesystemType("verity", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+
+	vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+
+	mntns, err := vfsObj.NewMountNamespace(ctx, auth.CredentialsFromContext(ctx), "", "verity", &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: InternalFilesystemOptions{
+				RootMerkleFileName:           rootMerkleFilename,
+				LowerName:                    "tmpfs",
+				AllowRuntimeEnable:           true,
+				NoCrashOnVerificationFailure: true,
+			},
+		},
+	})
+	if err != nil {
+		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("NewMountNamespace: %v", err)
+	}
+	root := mntns.Root()
+	root.IncRef()
+
+	// Use lowerRoot in the task as we modify the lower file system
+	// directly in many tests.
+	lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	task, err := testutil.CreateTask(ctx, "name", tc, mntns, lowerRoot, lowerRoot)
+	if err != nil {
+		t.Fatalf("testutil.CreateTask: %v", err)
+	}
+
+	t.Helper()
+	t.Cleanup(func() {
+		root.DecRef(ctx)
+		mntns.DecRef(ctx)
+	})
+	return vfsObj, root, task, nil
+}
+
+// newFileFD creates a new file in the verity mount, and returns the FD. The FD
+// points to a file that has random data generated.
+func newFileFD(ctx context.Context, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, filePath string, mode linux.FileMode) (*vfs.FileDescription, int, error) {
+	creds := auth.CredentialsFromContext(ctx)
+	lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+
+	// Create the file in the underlying file system.
+	lowerFD, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  lowerRoot,
+		Start: lowerRoot,
+		Path:  fspath.Parse(filePath),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+		Mode:  linux.ModeRegular | mode,
+	})
+	if err != nil {
+		return nil, 0, err
+	}
+
+	// Generate random data to be written to the file.
+	dataSize := rand.Intn(maxDataSize) + 1
+	data := make([]byte, dataSize)
+	rand.Read(data)
+
+	// Write directly to the underlying FD, since verity FD is read-only.
+	n, err := lowerFD.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		return nil, 0, err
+	}
+
+	if n != int64(len(data)) {
+		return nil, 0, fmt.Errorf("lowerFD.Write got write length %d, want %d", n, len(data))
+	}
+
+	lowerFD.DecRef(ctx)
+
+	// Now open the verity file descriptor.
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(filePath),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+		Mode:  linux.ModeRegular | mode,
+	})
+	return fd, dataSize, err
+}
+
+// corruptRandomBit randomly flips a bit in the file represented by fd.
+func corruptRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) error {
+	// Flip a random bit in the underlying file.
+	randomPos := int64(rand.Intn(size))
+	byteToModify := make([]byte, 1)
+	if _, err := fd.PRead(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.ReadOptions{}); err != nil {
+		return fmt.Errorf("lowerFD.PRead: %v", err)
+	}
+	byteToModify[0] ^= 1
+	if _, err := fd.PWrite(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.WriteOptions{}); err != nil {
+		return fmt.Errorf("lowerFD.PWrite: %v", err)
+	}
+	return nil
+}
+
+// TestOpen ensures that when a file is created, the corresponding Merkle tree
+// file and the root Merkle tree file exist.
+func TestOpen(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	if _, _, err := newFileFD(ctx, vfsObj, root, filename, 0644); err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Ensure that the corresponding Merkle tree file is created.
+	lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+	if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerRoot,
+		Start: lowerRoot,
+		Path:  fspath.Parse(merklePrefix + filename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	}); err != nil {
+		t.Errorf("OpenAt Merkle tree file %s: %v", merklePrefix+filename, err)
+	}
+
+	// Ensure the root merkle tree file is created.
+	if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerRoot,
+		Start: lowerRoot,
+		Path:  fspath.Parse(merklePrefix + rootMerkleFilename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	}); err != nil {
+		t.Errorf("OpenAt root Merkle tree file %s: %v", merklePrefix+rootMerkleFilename, err)
+	}
+}
+
+// TestPReadUnmodifiedFileSucceeds ensures that pread from an untouched verity
+// file succeeds after enabling verity for it.
+func TestPReadUnmodifiedFileSucceeds(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file and confirm a normal read succeeds.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	buf := make([]byte, size)
+	n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{})
+	if err != nil && err != io.EOF {
+		t.Fatalf("fd.PRead: %v", err)
+	}
+
+	if n != int64(size) {
+		t.Errorf("fd.PRead got read length %d, want %d", n, size)
+	}
+}
+
+// TestReadUnmodifiedFileSucceeds ensures that read from an untouched verity
+// file succeeds after enabling verity for it.
+func TestReadUnmodifiedFileSucceeds(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file and confirm a normal read succeeds.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	buf := make([]byte, size)
+	n, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+	if err != nil && err != io.EOF {
+		t.Fatalf("fd.Read: %v", err)
+	}
+
+	if n != int64(size) {
+		t.Errorf("fd.PRead got read length %d, want %d", n, size)
+	}
+}
+
+// TestReopenUnmodifiedFileSucceeds ensures that reopen an untouched verity file
+// succeeds after enabling verity for it.
+func TestReopenUnmodifiedFileSucceeds(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file and confirms a normal read succeeds.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Ensure reopening the verity enabled file succeeds.
+	if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(filename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+		Mode:  linux.ModeRegular,
+	}); err != nil {
+		t.Errorf("reopen enabled file failed: %v", err)
+	}
+}
+
+// TestPReadModifiedFileFails ensures that read from a modified verity file
+// fails.
+func TestPReadModifiedFileFails(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Open a new lowerFD that's read/writable.
+	lowerVD := fd.Impl().(*fileDescription).d.lowerVD
+
+	lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
+		t.Fatalf("corruptRandomBit: %v", err)
+	}
+
+	// Confirm that read from the modified file fails.
+	buf := make([]byte, size)
+	if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
+		t.Fatalf("fd.PRead succeeded, expected failure")
+	}
+}
+
+// TestReadModifiedFileFails ensures that read from a modified verity file
+// fails.
+func TestReadModifiedFileFails(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Open a new lowerFD that's read/writable.
+	lowerVD := fd.Impl().(*fileDescription).d.lowerVD
+
+	lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
+		t.Fatalf("corruptRandomBit: %v", err)
+	}
+
+	// Confirm that read from the modified file fails.
+	buf := make([]byte, size)
+	if _, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}); err == nil {
+		t.Fatalf("fd.Read succeeded, expected failure")
+	}
+}
+
+// TestModifiedMerkleFails ensures that read from a verity file fails if the
+// corresponding Merkle tree file is modified.
+func TestModifiedMerkleFails(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Open a new lowerMerkleFD that's read/writable.
+	lowerMerkleVD := fd.Impl().(*fileDescription).d.lowerMerkleVD
+
+	lowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerMerkleVD,
+		Start: lowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	// Flip a random bit in the Merkle tree file.
+	stat, err := lowerMerkleFD.Stat(ctx, vfs.StatOptions{})
+	if err != nil {
+		t.Fatalf("stat: %v", err)
+	}
+	merkleSize := int(stat.Size)
+	if err := corruptRandomBit(ctx, lowerMerkleFD, merkleSize); err != nil {
+		t.Fatalf("corruptRandomBit: %v", err)
+	}
+
+	// Confirm that read from a file with modified Merkle tree fails.
+	buf := make([]byte, size)
+	if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
+		fmt.Println(buf)
+		t.Fatalf("fd.PRead succeeded with modified Merkle file")
+	}
+}
+
+// TestModifiedParentMerkleFails ensures that open a verity enabled file in a
+// verity enabled directory fails if the hashes related to the target file in
+// the parent Merkle tree file is modified.
+func TestModifiedParentMerkleFails(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Enable verity on the parent directory.
+	parentFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	if _, err := parentFD.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Open a new lowerMerkleFD that's read/writable.
+	parentLowerMerkleVD := fd.Impl().(*fileDescription).d.parent.lowerMerkleVD
+
+	parentLowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  parentLowerMerkleVD,
+		Start: parentLowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	// Flip a random bit in the parent Merkle tree file.
+	// This parent directory contains only one child, so any random
+	// modification in the parent Merkle tree should cause verification
+	// failure when opening the child file.
+	stat, err := parentLowerMerkleFD.Stat(ctx, vfs.StatOptions{})
+	if err != nil {
+		t.Fatalf("stat: %v", err)
+	}
+	parentMerkleSize := int(stat.Size)
+	if err := corruptRandomBit(ctx, parentLowerMerkleFD, parentMerkleSize); err != nil {
+		t.Fatalf("corruptRandomBit: %v", err)
+	}
+
+	parentLowerMerkleFD.DecRef(ctx)
+
+	// Ensure reopening the verity enabled file fails.
+	if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(filename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+		Mode:  linux.ModeRegular,
+	}); err == nil {
+		t.Errorf("OpenAt file with modified parent Merkle succeeded")
+	}
+}
+
+// TestUnmodifiedStatSucceeds ensures that stat of an untouched verity file
+// succeeds after enabling verity for it.
+func TestUnmodifiedStatSucceeds(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file and confirms stat succeeds.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("fd.Ioctl: %v", err)
+	}
+
+	if _, err := fd.Stat(ctx, vfs.StatOptions{}); err != nil {
+		t.Errorf("fd.Stat: %v", err)
+	}
+}
+
+// TestModifiedStatFails checks that getting stat for a file with modified stat
+// should fail.
+func TestModifiedStatFails(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("fd.Ioctl: %v", err)
+	}
+
+	lowerFD := fd.Impl().(*fileDescription).lowerFD
+	// Change the stat of the underlying file, and check that stat fails.
+	if err := lowerFD.SetStat(ctx, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: uint32(linux.STATX_MODE),
+			Mode: 0777,
+		},
+	}); err != nil {
+		t.Fatalf("lowerFD.SetStat: %v", err)
+	}
+
+	if _, err := fd.Stat(ctx, vfs.StatOptions{}); err == nil {
+		t.Errorf("fd.Stat succeeded when it should fail")
+	}
+}
+
+// TestOpenDeletedOrRenamedFileFails ensures that opening a deleted/renamed
+// verity enabled file or the corresponding Merkle tree file fails with the
+// verify error.
+func TestOpenDeletedFileFails(t *testing.T) {
+	testCases := []struct {
+		// Tests removing files is remove is true. Otherwise tests
+		// renaming files.
+		remove bool
+		// The original file is removed/renamed if changeFile is true.
+		changeFile bool
+		// The Merkle tree file is removed/renamed if changeMerkleFile
+		// is true.
+		changeMerkleFile bool
+	}{
+		{
+			remove:           true,
+			changeFile:       true,
+			changeMerkleFile: false,
+		},
+		{
+			remove:           true,
+			changeFile:       false,
+			changeMerkleFile: true,
+		},
+		{
+			remove:           false,
+			changeFile:       true,
+			changeMerkleFile: false,
+		},
+		{
+			remove:           false,
+			changeFile:       true,
+			changeMerkleFile: false,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(fmt.Sprintf("remove:%t", tc.remove), func(t *testing.T) {
+			vfsObj, root, ctx, err := newVerityRoot(t)
+			if err != nil {
+				t.Fatalf("newVerityRoot: %v", err)
+			}
+
+			filename := "verity-test-file"
+			fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+			if err != nil {
+				t.Fatalf("newFileFD: %v", err)
+			}
+
+			// Enable verity on the file.
+			var args arch.SyscallArguments
+			args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+			if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+				t.Fatalf("Ioctl: %v", err)
+			}
+
+			rootLowerVD := root.Dentry().Impl().(*dentry).lowerVD
+			if tc.remove {
+				if tc.changeFile {
+					if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+						Root:  rootLowerVD,
+						Start: rootLowerVD,
+						Path:  fspath.Parse(filename),
+					}); err != nil {
+						t.Fatalf("UnlinkAt: %v", err)
+					}
+				}
+				if tc.changeMerkleFile {
+					if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+						Root:  rootLowerVD,
+						Start: rootLowerVD,
+						Path:  fspath.Parse(merklePrefix + filename),
+					}); err != nil {
+						t.Fatalf("UnlinkAt: %v", err)
+					}
+				}
+			} else {
+				newFilename := "renamed-test-file"
+				if tc.changeFile {
+					if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+						Root:  rootLowerVD,
+						Start: rootLowerVD,
+						Path:  fspath.Parse(filename),
+					}, &vfs.PathOperation{
+						Root:  rootLowerVD,
+						Start: rootLowerVD,
+						Path:  fspath.Parse(newFilename),
+					}, &vfs.RenameOptions{}); err != nil {
+						t.Fatalf("RenameAt: %v", err)
+					}
+				}
+				if tc.changeMerkleFile {
+					if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+						Root:  rootLowerVD,
+						Start: rootLowerVD,
+						Path:  fspath.Parse(merklePrefix + filename),
+					}, &vfs.PathOperation{
+						Root:  rootLowerVD,
+						Start: rootLowerVD,
+						Path:  fspath.Parse(merklePrefix + newFilename),
+					}, &vfs.RenameOptions{}); err != nil {
+						t.Fatalf("UnlinkAt: %v", err)
+					}
+				}
+			}
+
+			// Ensure reopening the verity enabled file fails.
+			if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+				Root:  root,
+				Start: root,
+				Path:  fspath.Parse(filename),
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDONLY,
+				Mode:  linux.ModeRegular,
+			}); err != syserror.EIO {
+				t.Errorf("got OpenAt error: %v, expected EIO", err)
+			}
+		})
+	}
+}
diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD
index 61c78569d..300b7ccce 100644
--- a/pkg/sentry/hostmm/BUILD
+++ b/pkg/sentry/hostmm/BUILD
@@ -7,11 +7,14 @@ go_library(
     srcs = [
         "cgroup.go",
         "hostmm.go",
+        "membarrier.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
+        "//pkg/abi/linux",
         "//pkg/fd",
         "//pkg/log",
         "//pkg/usermem",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/hostmm/membarrier.go b/pkg/sentry/hostmm/membarrier.go
new file mode 100644
index 000000000..4468d75f1
--- /dev/null
+++ b/pkg/sentry/hostmm/membarrier.go
@@ -0,0 +1,90 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostmm
+
+import (
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+var (
+	haveMembarrierGlobal           = false
+	haveMembarrierPrivateExpedited = false
+)
+
+func init() {
+	supported, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_QUERY, 0 /* flags */, 0 /* unused */)
+	if e != 0 {
+		if e != syscall.ENOSYS {
+			log.Warningf("membarrier(MEMBARRIER_CMD_QUERY) failed: %s", e.Error())
+		}
+		return
+	}
+	// We don't use MEMBARRIER_CMD_GLOBAL_EXPEDITED because this sends IPIs to
+	// all CPUs running tasks that have previously invoked
+	// MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, which presents a DOS risk.
+	// (MEMBARRIER_CMD_GLOBAL is synchronize_rcu(), i.e. it waits for an RCU
+	// grace period to elapse without bothering other CPUs.
+	// MEMBARRIER_CMD_PRIVATE_EXPEDITED sends IPIs only to CPUs running tasks
+	// sharing the caller's MM.)
+	if supported&linux.MEMBARRIER_CMD_GLOBAL != 0 {
+		haveMembarrierGlobal = true
+	}
+	if req := uintptr(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED | linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED); supported&req == req {
+		if _, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 {
+			log.Warningf("membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) failed: %s", e.Error())
+		} else {
+			haveMembarrierPrivateExpedited = true
+		}
+	}
+}
+
+// HaveGlobalMemoryBarrier returns true if GlobalMemoryBarrier is supported.
+func HaveGlobalMemoryBarrier() bool {
+	return haveMembarrierGlobal
+}
+
+// GlobalMemoryBarrier blocks until "all running threads [in the host OS] have
+// passed through a state where all memory accesses to user-space addresses
+// match program order between entry to and return from [GlobalMemoryBarrier]",
+// as for membarrier(2).
+//
+// Preconditions: HaveGlobalMemoryBarrier() == true.
+func GlobalMemoryBarrier() error {
+	if _, _, e := syscall.Syscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_GLOBAL, 0 /* flags */, 0 /* unused */); e != 0 {
+		return e
+	}
+	return nil
+}
+
+// HaveProcessMemoryBarrier returns true if ProcessMemoryBarrier is supported.
+func HaveProcessMemoryBarrier() bool {
+	return haveMembarrierPrivateExpedited
+}
+
+// ProcessMemoryBarrier is equivalent to GlobalMemoryBarrier, but only
+// synchronizes with threads sharing a virtual address space (from the host OS'
+// perspective) with the calling thread.
+//
+// Preconditions: HaveProcessMemoryBarrier() == true.
+func ProcessMemoryBarrier() error {
+	if _, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 {
+		return e
+	}
+	return nil
+}
diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index 07bf39fed..5bba9de0b 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -15,6 +15,7 @@ go_library(
     ],
     deps = [
         "//pkg/context",
+        "//pkg/tcpip",
         "//pkg/tcpip/stack",
     ],
 )
diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index c0b4831d1..f31277d30 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -15,7 +15,10 @@
 // Package inet defines semantics for IP stacks.
 package inet
 
-import "gvisor.dev/gvisor/pkg/tcpip/stack"
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
 
 // Stack represents a TCP/IP stack.
 type Stack interface {
@@ -29,9 +32,13 @@ type Stack interface {
 	InterfaceAddrs() map[int32][]InterfaceAddr
 
 	// AddInterfaceAddr adds an address to the network interface identified by
-	// index.
+	// idx.
 	AddInterfaceAddr(idx int32, addr InterfaceAddr) error
 
+	// RemoveInterfaceAddr removes an address from the network interface
+	// identified by idx.
+	RemoveInterfaceAddr(idx int32, addr InterfaceAddr) error
+
 	// SupportsIPv6 returns true if the stack supports IPv6 connectivity.
 	SupportsIPv6() bool
 
@@ -80,6 +87,12 @@ type Stack interface {
 	// RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful
 	// for restoring a stack after a save.
 	RestoreCleanupEndpoints([]stack.TransportEndpoint)
+
+	// Forwarding returns if packet forwarding between NICs is enabled.
+	Forwarding(protocol tcpip.NetworkProtocolNumber) bool
+
+	// SetForwarding enables or disables packet forwarding between NICs.
+	SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error
 }
 
 // Interface contains information about a network interface.
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index 9771f01fc..9ebeba8a3 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -14,7 +14,13 @@
 
 package inet
 
-import "gvisor.dev/gvisor/pkg/tcpip/stack"
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
 
 // TestStack is a dummy implementation of Stack for tests.
 type TestStack struct {
@@ -26,6 +32,7 @@ type TestStack struct {
 	TCPSendBufSize    TCPBufferSize
 	TCPSACKFlag       bool
 	Recovery          TCPLossRecovery
+	IPForwarding      bool
 }
 
 // NewTestStack returns a TestStack with no network interfaces. The value of
@@ -54,6 +61,24 @@ func (s *TestStack) AddInterfaceAddr(idx int32, addr InterfaceAddr) error {
 	return nil
 }
 
+// RemoveInterfaceAddr implements Stack.RemoveInterfaceAddr.
+func (s *TestStack) RemoveInterfaceAddr(idx int32, addr InterfaceAddr) error {
+	interfaceAddrs, ok := s.InterfaceAddrsMap[idx]
+	if !ok {
+		return fmt.Errorf("unknown idx: %d", idx)
+	}
+
+	var filteredAddrs []InterfaceAddr
+	for _, interfaceAddr := range interfaceAddrs {
+		if !bytes.Equal(interfaceAddr.Addr, addr.Addr) {
+			filteredAddrs = append(filteredAddrs, addr)
+		}
+	}
+	s.InterfaceAddrsMap[idx] = filteredAddrs
+
+	return nil
+}
+
 // SupportsIPv6 implements Stack.SupportsIPv6.
 func (s *TestStack) SupportsIPv6() bool {
 	return s.SupportsIPv6Flag
@@ -128,3 +153,14 @@ func (s *TestStack) CleanupEndpoints() []stack.TransportEndpoint {
 
 // RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
 func (s *TestStack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {}
+
+// Forwarding implements inet.Stack.Forwarding.
+func (s *TestStack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool {
+	return s.IPForwarding
+}
+
+// SetForwarding implements inet.Stack.SetForwarding.
+func (s *TestStack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error {
+	s.IPForwarding = enable
+	return nil
+}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 5416a310d..90dd4a047 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -69,8 +69,63 @@ go_template_instance(
     prefix = "socket",
     template = "//pkg/ilist:generic_list",
     types = {
-        "Element": "*SocketEntry",
-        "Linker": "*SocketEntry",
+        "Element": "*SocketRecordVFS1",
+        "Linker": "*SocketRecordVFS1",
+    },
+)
+
+go_template_instance(
+    name = "fd_table_refs",
+    out = "fd_table_refs.go",
+    package = "kernel",
+    prefix = "FDTable",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "FDTable",
+    },
+)
+
+go_template_instance(
+    name = "fs_context_refs",
+    out = "fs_context_refs.go",
+    package = "kernel",
+    prefix = "FSContext",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "FSContext",
+    },
+)
+
+go_template_instance(
+    name = "ipc_namespace_refs",
+    out = "ipc_namespace_refs.go",
+    package = "kernel",
+    prefix = "IPCNamespace",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "IPCNamespace",
+    },
+)
+
+go_template_instance(
+    name = "process_group_refs",
+    out = "process_group_refs.go",
+    package = "kernel",
+    prefix = "ProcessGroup",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "ProcessGroup",
+    },
+)
+
+go_template_instance(
+    name = "session_refs",
+    out = "session_refs.go",
+    package = "kernel",
+    prefix = "Session",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "Session",
     },
 )
 
@@ -88,9 +143,14 @@ go_library(
         "aio.go",
         "context.go",
         "fd_table.go",
+        "fd_table_refs.go",
         "fd_table_unsafe.go",
         "fs_context.go",
+        "fs_context_refs.go",
         "ipc_namespace.go",
+        "ipc_namespace_refs.go",
+        "kcov.go",
+        "kcov_unsafe.go",
         "kernel.go",
         "kernel_opts.go",
         "kernel_state.go",
@@ -99,6 +159,7 @@ go_library(
         "pending_signals_state.go",
         "posixtimer.go",
         "process_group_list.go",
+        "process_group_refs.go",
         "ptrace.go",
         "ptrace_amd64.go",
         "ptrace_arm64.go",
@@ -106,6 +167,7 @@ go_library(
         "seccomp.go",
         "seqatomic_taskgoroutineschedinfo_unsafe.go",
         "session_list.go",
+        "session_refs.go",
         "sessions.go",
         "signal.go",
         "signal_handlers.go",
@@ -147,23 +209,27 @@ go_library(
         "gvisor.dev/gvisor/pkg/sentry/device",
         "gvisor.dev/gvisor/pkg/tcpip",
     ],
+    marshal = True,
     visibility = ["//:sandbox"],
     deps = [
         ":uncaught_signal_go_proto",
         "//pkg/abi",
         "//pkg/abi/linux",
         "//pkg/amutex",
-        "//pkg/binary",
         "//pkg/bits",
         "//pkg/bpf",
+        "//pkg/cleanup",
         "//pkg/context",
+        "//pkg/coverage",
         "//pkg/cpuid",
         "//pkg/eventchannel",
         "//pkg/fspath",
         "//pkg/log",
+        "//pkg/marshal",
+        "//pkg/marshal/primitive",
         "//pkg/metric",
         "//pkg/refs",
-        "//pkg/refs_vfs2",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/secio",
         "//pkg/sentry/arch",
@@ -210,7 +276,6 @@ go_library(
         "//pkg/tcpip/stack",
         "//pkg/usermem",
         "//pkg/waiter",
-        "//tools/go_marshal/marshal",
     ],
 )
 
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go
index 1b9721534..0ddbe5ff6 100644
--- a/pkg/sentry/kernel/abstract_socket_namespace.go
+++ b/pkg/sentry/kernel/abstract_socket_namespace.go
@@ -19,7 +19,7 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs_vfs2"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sync"
 )
@@ -27,7 +27,7 @@ import (
 // +stateify savable
 type abstractEndpoint struct {
 	ep     transport.BoundEndpoint
-	socket refs_vfs2.RefCounter
+	socket refsvfs2.RefCounter
 	name   string
 	ns     *AbstractSocketNamespace
 }
@@ -57,7 +57,7 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace {
 // its backing socket.
 type boundEndpoint struct {
 	transport.BoundEndpoint
-	socket refs_vfs2.RefCounter
+	socket refsvfs2.RefCounter
 }
 
 // Release implements transport.BoundEndpoint.Release.
@@ -89,7 +89,7 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp
 //
 // When the last reference managed by socket is dropped, ep may be removed from the
 // namespace.
-func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refs_vfs2.RefCounter) error {
+func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.RefCounter) error {
 	a.mu.Lock()
 	defer a.mu.Unlock()
 
@@ -109,7 +109,7 @@ func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep tran
 
 // Remove removes the specified socket at name from the abstract socket
 // namespace, if it has not yet been replaced.
-func (a *AbstractSocketNamespace) Remove(name string, socket refs_vfs2.RefCounter) {
+func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.RefCounter) {
 	a.mu.Lock()
 	defer a.mu.Unlock()
 
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 2bc49483a..869e49ebc 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -57,6 +57,7 @@ go_library(
         "id_map_set.go",
         "user_namespace.go",
     ],
+    marshal = True,
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go
index ef5723127..c08d47787 100644
--- a/pkg/sentry/kernel/auth/context.go
+++ b/pkg/sentry/kernel/auth/context.go
@@ -34,3 +34,23 @@ func CredentialsFromContext(ctx context.Context) *Credentials {
 	}
 	return NewAnonymousCredentials()
 }
+
+// ContextWithCredentials returns a copy of ctx carrying creds.
+func ContextWithCredentials(ctx context.Context, creds *Credentials) context.Context {
+	return &authContext{ctx, creds}
+}
+
+type authContext struct {
+	context.Context
+	creds *Credentials
+}
+
+// Value implements context.Context.
+func (ac *authContext) Value(key interface{}) interface{} {
+	switch key {
+	case CtxCredentials:
+		return ac.creds
+	default:
+		return ac.Context.Value(key)
+	}
+}
diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go
index 0a58ba17c..4c32ee703 100644
--- a/pkg/sentry/kernel/auth/id.go
+++ b/pkg/sentry/kernel/auth/id.go
@@ -19,9 +19,13 @@ import (
 )
 
 // UID is a user ID in an unspecified user namespace.
+//
+// +marshal
 type UID uint32
 
 // GID is a group ID in an unspecified user namespace.
+//
+// +marshal slice:GIDSlice
 type GID uint32
 
 // In the root user namespace, user/group IDs have a 1-to-1 relationship with
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
index dd5f0f5fa..bb94769c4 100644
--- a/pkg/sentry/kernel/context.go
+++ b/pkg/sentry/kernel/context.go
@@ -81,7 +81,8 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace {
 }
 
 // IPCNamespaceFromContext returns the IPC namespace in which ctx is executing,
-// or nil if there is no such IPC namespace.
+// or nil if there is no such IPC namespace. It takes a reference on the
+// namespace.
 func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace {
 	if v := ctx.Value(CtxIPCNamespace); v != nil {
 		return v.(*IPCNamespace)
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index ce53af69b..7aba31587 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -23,7 +23,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
@@ -78,7 +77,8 @@ type descriptor struct {
 //
 // +stateify savable
 type FDTable struct {
-	refs.AtomicRefCount
+	FDTableRefs
+
 	k *Kernel
 
 	// mu protects below.
@@ -110,9 +110,12 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
 
 func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
 	ctx := context.Background()
-	f.init() // Initialize table.
+	f.initNoLeakCheck() // Initialize table.
+	f.used = 0
 	for fd, d := range m {
-		f.setAll(fd, d.file, d.fileVFS2, d.flags)
+		if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil {
+			panic("VFS1 or VFS2 files set")
+		}
 
 		// Note that we do _not_ need to acquire a extra table reference here. The
 		// table reference will already be accounted for in the file, so we drop the
@@ -127,7 +130,7 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
 }
 
 // drop drops the table reference.
-func (f *FDTable) drop(file *fs.File) {
+func (f *FDTable) drop(ctx context.Context, file *fs.File) {
 	// Release locks.
 	file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF})
 
@@ -145,14 +148,13 @@ func (f *FDTable) drop(file *fs.File) {
 	d.InotifyEvent(ev, 0)
 
 	// Drop the table reference.
-	file.DecRef(context.Background())
+	file.DecRef(ctx)
 }
 
 // dropVFS2 drops the table reference.
-func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
+func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) {
 	// Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the
 	// entire file.
-	ctx := context.Background()
 	err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET)
 	if err != nil && err != syserror.ENOLCK {
 		panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
@@ -176,22 +178,15 @@ func (k *Kernel) NewFDTable() *FDTable {
 	return f
 }
 
-// destroy removes all of the file descriptors from the map.
-func (f *FDTable) destroy(ctx context.Context) {
-	f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool {
-		return true
-	})
-}
-
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
+// DecRef implements RefCounter.DecRef.
+//
+// If f reaches zero references, all of its file descriptors are removed.
 func (f *FDTable) DecRef(ctx context.Context) {
-	f.DecRefWithDestructor(ctx, f.destroy)
-}
-
-// Size returns the number of file descriptor slots currently allocated.
-func (f *FDTable) Size() int {
-	size := atomic.LoadInt32(&f.used)
-	return int(size)
+	f.FDTableRefs.DecRef(func() {
+		f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool {
+			return true
+		})
+	})
 }
 
 // forEach iterates over all non-nil files in sorted order.
@@ -245,6 +240,10 @@ func (f *FDTable) String() string {
 
 		case fileVFS2 != nil:
 			vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem()
+			vd := fileVFS2.VirtualDentry()
+			if vd.Dentry() == nil {
+				panic(fmt.Sprintf("fd %d (type %T) has nil dentry: %#v", fd, fileVFS2.Impl(), fileVFS2))
+			}
 			name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
 			if err != nil {
 				fmt.Fprintf(&buf, "<err: %v>\n", err)
@@ -280,7 +279,6 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 	}
 
 	f.mu.Lock()
-	defer f.mu.Unlock()
 
 	// From f.next to find available fd.
 	if fd < f.next {
@@ -290,15 +288,25 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 	// Install all entries.
 	for i := fd; i < end && len(fds) < len(files); i++ {
 		if d, _, _ := f.get(i); d == nil {
-			f.set(i, files[len(fds)], flags) // Set the descriptor.
-			fds = append(fds, i)             // Record the file descriptor.
+			// Set the descriptor.
+			f.set(ctx, i, files[len(fds)], flags)
+			fds = append(fds, i) // Record the file descriptor.
 		}
 	}
 
 	// Failure? Unwind existing FDs.
 	if len(fds) < len(files) {
 		for _, i := range fds {
-			f.set(i, nil, FDFlags{}) // Zap entry.
+			f.set(ctx, i, nil, FDFlags{})
+		}
+		f.mu.Unlock()
+
+		// Drop the reference taken by the call to f.set() that
+		// originally installed the file. Don't call f.drop()
+		// (generating inotify events, etc.) since the file should
+		// appear to have never been inserted into f.
+		for _, file := range files[:len(fds)] {
+			file.DecRef(ctx)
 		}
 		return nil, syscall.EMFILE
 	}
@@ -308,6 +316,7 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 		f.next = fds[len(fds)-1] + 1
 	}
 
+	f.mu.Unlock()
 	return fds, nil
 }
 
@@ -335,7 +344,6 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
 	}
 
 	f.mu.Lock()
-	defer f.mu.Unlock()
 
 	// From f.next to find available fd.
 	if fd < f.next {
@@ -345,15 +353,25 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
 	// Install all entries.
 	for i := fd; i < end && len(fds) < len(files); i++ {
 		if d, _, _ := f.getVFS2(i); d == nil {
-			f.setVFS2(i, files[len(fds)], flags) // Set the descriptor.
-			fds = append(fds, i)                 // Record the file descriptor.
+			// Set the descriptor.
+			f.setVFS2(ctx, i, files[len(fds)], flags)
+			fds = append(fds, i) // Record the file descriptor.
 		}
 	}
 
 	// Failure? Unwind existing FDs.
 	if len(fds) < len(files) {
 		for _, i := range fds {
-			f.setVFS2(i, nil, FDFlags{}) // Zap entry.
+			f.setVFS2(ctx, i, nil, FDFlags{})
+		}
+		f.mu.Unlock()
+
+		// Drop the reference taken by the call to f.setVFS2() that
+		// originally installed the file. Don't call f.dropVFS2()
+		// (generating inotify events, etc.) since the file should
+		// appear to have never been inserted into f.
+		for _, file := range files[:len(fds)] {
+			file.DecRef(ctx)
 		}
 		return nil, syscall.EMFILE
 	}
@@ -363,6 +381,7 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes
 		f.next = fds[len(fds)-1] + 1
 	}
 
+	f.mu.Unlock()
 	return fds, nil
 }
 
@@ -398,7 +417,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
 	}
 	for fd < end {
 		if d, _, _ := f.getVFS2(fd); d == nil {
-			f.setVFS2(fd, file, flags)
+			f.setVFS2(ctx, fd, file, flags)
 			if fd == f.next {
 				// Update next search start position.
 				f.next = fd + 1
@@ -414,40 +433,55 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc
 // reference for that FD, the ref count for that existing reference is
 // decremented.
 func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
-	return f.newFDAt(ctx, fd, file, nil, flags)
+	df, _, err := f.newFDAt(ctx, fd, file, nil, flags)
+	if err != nil {
+		return err
+	}
+	if df != nil {
+		f.drop(ctx, df)
+	}
+	return nil
 }
 
 // NewFDAtVFS2 sets the file reference for the given FD. If there is an active
 // reference for that FD, the ref count for that existing reference is
 // decremented.
 func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error {
-	return f.newFDAt(ctx, fd, nil, file, flags)
+	_, dfVFS2, err := f.newFDAt(ctx, fd, nil, file, flags)
+	if err != nil {
+		return err
+	}
+	if dfVFS2 != nil {
+		f.dropVFS2(ctx, dfVFS2)
+	}
+	return nil
 }
 
-func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error {
+func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription, error) {
 	if fd < 0 {
 		// Don't accept negative FDs.
-		return syscall.EBADF
+		return nil, nil, syscall.EBADF
 	}
 
 	// Check the limit for the provided file.
 	if limitSet := limits.FromContext(ctx); limitSet != nil {
 		if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
-			return syscall.EMFILE
+			return nil, nil, syscall.EMFILE
 		}
 	}
 
 	// Install the entry.
 	f.mu.Lock()
 	defer f.mu.Unlock()
-	f.setAll(fd, file, fileVFS2, flags)
-	return nil
+
+	df, dfVFS2 := f.setAll(ctx, fd, file, fileVFS2, flags)
+	return df, dfVFS2, nil
 }
 
 // SetFlags sets the flags for the given file descriptor.
 //
 // True is returned iff flags were changed.
-func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
+func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error {
 	if fd < 0 {
 		// Don't accept negative FDs.
 		return syscall.EBADF
@@ -463,14 +497,14 @@ func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
 	}
 
 	// Update the flags.
-	f.set(fd, file, flags)
+	f.set(ctx, fd, file, flags)
 	return nil
 }
 
 // SetFlagsVFS2 sets the flags for the given file descriptor.
 //
 // True is returned iff flags were changed.
-func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
+func (f *FDTable) SetFlagsVFS2(ctx context.Context, fd int32, flags FDFlags) error {
 	if fd < 0 {
 		// Don't accept negative FDs.
 		return syscall.EBADF
@@ -486,7 +520,7 @@ func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error {
 	}
 
 	// Update the flags.
-	f.setVFS2(fd, file, flags)
+	f.setVFS2(ctx, fd, file, flags)
 	return nil
 }
 
@@ -552,30 +586,6 @@ func (f *FDTable) GetFDs(ctx context.Context) []int32 {
 	return fds
 }
 
-// GetRefs returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDTable) GetRefs(ctx context.Context) []*fs.File {
-	files := make([]*fs.File, 0, f.Size())
-	f.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
-		file.IncRef() // Acquire a reference for caller.
-		files = append(files, file)
-	})
-	return files
-}
-
-// GetRefsVFS2 returns a stable slice of references to all files and bumps the
-// reference count on each. The caller must use DecRef on each reference when
-// they're done using the slice.
-func (f *FDTable) GetRefsVFS2(ctx context.Context) []*vfs.FileDescription {
-	files := make([]*vfs.FileDescription, 0, f.Size())
-	f.forEach(ctx, func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) {
-		file.IncRef() // Acquire a reference for caller.
-		files = append(files, file)
-	})
-	return files
-}
-
 // Fork returns an independent FDTable.
 func (f *FDTable) Fork(ctx context.Context) *FDTable {
 	clone := f.k.NewFDTable()
@@ -583,11 +593,8 @@ func (f *FDTable) Fork(ctx context.Context) *FDTable {
 	f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
 		// The set function here will acquire an appropriate table
 		// reference for the clone. We don't need anything else.
-		switch {
-		case file != nil:
-			clone.set(fd, file, flags)
-		case fileVFS2 != nil:
-			clone.setVFS2(fd, fileVFS2, flags)
+		if df, dfVFS2 := clone.setAll(ctx, fd, file, fileVFS2, flags); df != nil || dfVFS2 != nil {
+			panic("VFS1 or VFS2 files set")
 		}
 	})
 	return clone
@@ -596,13 +603,12 @@ func (f *FDTable) Fork(ctx context.Context) *FDTable {
 // Remove removes an FD from and returns a non-file iff successful.
 //
 // N.B. Callers are required to use DecRef when they are done.
-func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
+func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDescription) {
 	if fd < 0 {
 		return nil, nil
 	}
 
 	f.mu.Lock()
-	defer f.mu.Unlock()
 
 	// Update current available position.
 	if fd < f.next {
@@ -618,24 +624,51 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
 	case orig2 != nil:
 		orig2.IncRef()
 	}
+
 	if orig != nil || orig2 != nil {
-		f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+		orig, orig2 = f.setAll(ctx, fd, nil, nil, FDFlags{}) // Zap entry.
 	}
+	f.mu.Unlock()
+
+	if orig != nil {
+		f.drop(ctx, orig)
+	}
+	if orig2 != nil {
+		f.dropVFS2(ctx, orig2)
+	}
+
 	return orig, orig2
 }
 
 // RemoveIf removes all FDs where cond is true.
 func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
-	f.mu.Lock()
-	defer f.mu.Unlock()
+	// TODO(gvisor.dev/issue/1624): Remove fs.File slice.
+	var files []*fs.File
+	var filesVFS2 []*vfs.FileDescription
 
+	f.mu.Lock()
 	f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
 		if cond(file, fileVFS2, flags) {
-			f.set(fd, nil, FDFlags{}) // Clear from table.
+			df, dfVFS2 := f.setAll(ctx, fd, nil, nil, FDFlags{}) // Clear from table.
+			if df != nil {
+				files = append(files, df)
+			}
+			if dfVFS2 != nil {
+				filesVFS2 = append(filesVFS2, dfVFS2)
+			}
 			// Update current available position.
 			if fd < f.next {
 				f.next = fd
 			}
 		}
 	})
+	f.mu.Unlock()
+
+	for _, file := range files {
+		f.drop(ctx, file)
+	}
+
+	for _, file := range filesVFS2 {
+		f.dropVFS2(ctx, file)
+	}
 }
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index e3f30ba2a..bf5460083 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -72,7 +72,7 @@ func TestFDTableMany(t *testing.T) {
 		}
 
 		i := int32(2)
-		fdTable.Remove(i)
+		fdTable.Remove(ctx, i)
 		if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i {
 			t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err)
 		}
@@ -93,7 +93,7 @@ func TestFDTableOverLimit(t *testing.T) {
 			t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err)
 		} else {
 			for _, fd := range fds {
-				fdTable.Remove(fd)
+				fdTable.Remove(ctx, fd)
 			}
 		}
 
@@ -150,13 +150,13 @@ func TestFDTable(t *testing.T) {
 			t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref)
 		}
 
-		ref, _ := fdTable.Remove(1)
+		ref, _ := fdTable.Remove(ctx, 1)
 		if ref == nil {
 			t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success")
 		}
 		ref.DecRef(ctx)
 
-		if ref, _ := fdTable.Remove(1); ref != nil {
+		if ref, _ := fdTable.Remove(ctx, 1); ref != nil {
 			t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
 		}
 	})
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index 7fd97dc53..3476551f3 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -18,6 +18,7 @@ import (
 	"sync/atomic"
 	"unsafe"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
@@ -30,12 +31,21 @@ type descriptorTable struct {
 	slice unsafe.Pointer `state:".(map[int32]*descriptor)"`
 }
 
-// init initializes the table.
-func (f *FDTable) init() {
+// initNoLeakCheck initializes the table without enabling leak checking.
+//
+// This is used when loading an FDTable after S/R, during which the ref count
+// object itself will enable leak checking if necessary.
+func (f *FDTable) initNoLeakCheck() {
 	var slice []unsafe.Pointer // Empty slice.
 	atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
 }
 
+// init initializes the table with leak checking.
+func (f *FDTable) init() {
+	f.initNoLeakCheck()
+	f.EnableLeakCheck()
+}
+
 // get gets a file entry.
 //
 // The boolean indicates whether this was in range.
@@ -76,33 +86,37 @@ func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, boo
 	return d.file, d.fileVFS2, d.flags, true
 }
 
-// set sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// CurrentMaxFDs returns the number of file descriptors that may be stored in f
+// without reallocation.
+func (f *FDTable) CurrentMaxFDs() int {
+	slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+	return len(slice)
+}
+
+// set sets an entry for VFS1, refer to setAll().
 //
 // Precondition: mu must be held.
-func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
-	f.setAll(fd, file, nil, flags)
+func (f *FDTable) set(ctx context.Context, fd int32, file *fs.File, flags FDFlags) *fs.File {
+	dropFile, _ := f.setAll(ctx, fd, file, nil, flags)
+	return dropFile
 }
 
-// setVFS2 sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// setVFS2 sets an entry for VFS2, refer to setAll().
 //
 // Precondition: mu must be held.
-func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) {
-	f.setAll(fd, nil, file, flags)
+func (f *FDTable) setVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) *vfs.FileDescription {
+	_, dropFile := f.setAll(ctx, fd, nil, file, flags)
+	return dropFile
 }
 
-// setAll sets an entry.
-//
-// This handles accounting changes, as well as acquiring and releasing the
-// reference needed by the table iff the file is different.
+// setAll sets the file description referred to by fd to file/fileVFS2. If
+// file/fileVFS2 are non-nil, it takes a reference on them. If setAll replaces
+// an existing file description, it returns it with the FDTable's reference
+// transferred to the caller, which must call f.drop/dropVFS2() on the returned
+// file after unlocking f.mu.
 //
 // Precondition: mu must be held.
-func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription) {
 	if file != nil && fileVFS2 != nil {
 		panic("VFS1 and VFS2 files set")
 	}
@@ -145,25 +159,25 @@ func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription,
 		}
 	}
 
-	// Drop the table reference.
+	// Adjust used.
+	switch {
+	case orig == nil && desc != nil:
+		atomic.AddInt32(&f.used, 1)
+	case orig != nil && desc == nil:
+		atomic.AddInt32(&f.used, -1)
+	}
+
 	if orig != nil {
 		switch {
 		case orig.file != nil:
 			if desc == nil || desc.file != orig.file {
-				f.drop(orig.file)
+				return orig.file, nil
 			}
 		case orig.fileVFS2 != nil:
 			if desc == nil || desc.fileVFS2 != orig.fileVFS2 {
-				f.dropVFS2(orig.fileVFS2)
+				return nil, orig.fileVFS2
 			}
 		}
 	}
-
-	// Adjust used.
-	switch {
-	case orig == nil && desc != nil:
-		atomic.AddInt32(&f.used, 1)
-	case orig != nil && desc == nil:
-		atomic.AddInt32(&f.used, -1)
-	}
+	return nil, nil
 }
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 8f2d36d5a..41fb2a784 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -18,7 +18,6 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -30,7 +29,7 @@ import (
 //
 // +stateify savable
 type FSContext struct {
-	refs.AtomicRefCount
+	FSContextRefs
 
 	// mu protects below.
 	mu sync.Mutex `state:"nosave"`
@@ -64,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
 		cwd:   cwd,
 		umask: umask,
 	}
-	f.EnableLeakCheck("kernel.FSContext")
+	f.EnableLeakCheck()
 	return &f
 }
 
@@ -77,95 +76,103 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
 		cwdVFS2:  cwd,
 		umask:    umask,
 	}
-	f.EnableLeakCheck("kernel.FSContext")
+	f.EnableLeakCheck()
 	return &f
 }
 
-// destroy is the destructor for an FSContext.
+// DecRef implements RefCounter.DecRef.
 //
-// This will call DecRef on both root and cwd Dirents.  If either call to
-// DecRef returns an error, then it will be propagated.  If both calls to
-// DecRef return an error, then the one from root.DecRef will be propagated.
+// When f reaches zero references, DecRef will be called on both root and cwd
+// Dirents.
 //
 // Note that there may still be calls to WorkingDirectory() or RootDirectory()
 // (that return nil).  This is because valid references may still be held via
 // proc files or other mechanisms.
-func (f *FSContext) destroy(ctx context.Context) {
-	// Hold f.mu so that we don't race with RootDirectory() and
-	// WorkingDirectory().
-	f.mu.Lock()
-	defer f.mu.Unlock()
-
-	if VFS2Enabled {
-		f.rootVFS2.DecRef(ctx)
-		f.rootVFS2 = vfs.VirtualDentry{}
-		f.cwdVFS2.DecRef(ctx)
-		f.cwdVFS2 = vfs.VirtualDentry{}
-	} else {
-		f.root.DecRef(ctx)
-		f.root = nil
-		f.cwd.DecRef(ctx)
-		f.cwd = nil
-	}
-}
-
-// DecRef implements RefCounter.DecRef with destructor f.destroy.
 func (f *FSContext) DecRef(ctx context.Context) {
-	f.DecRefWithDestructor(ctx, f.destroy)
+	f.FSContextRefs.DecRef(func() {
+		// Hold f.mu so that we don't race with RootDirectory() and
+		// WorkingDirectory().
+		f.mu.Lock()
+		defer f.mu.Unlock()
+
+		if VFS2Enabled {
+			f.rootVFS2.DecRef(ctx)
+			f.rootVFS2 = vfs.VirtualDentry{}
+			f.cwdVFS2.DecRef(ctx)
+			f.cwdVFS2 = vfs.VirtualDentry{}
+		} else {
+			f.root.DecRef(ctx)
+			f.root = nil
+			f.cwd.DecRef(ctx)
+			f.cwd = nil
+		}
+	})
 }
 
 // Fork forks this FSContext.
 //
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
 func (f *FSContext) Fork() *FSContext {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
 	if VFS2Enabled {
+		if !f.cwdVFS2.Ok() {
+			panic("FSContext.Fork() called after destroy")
+		}
 		f.cwdVFS2.IncRef()
 		f.rootVFS2.IncRef()
 	} else {
+		if f.cwd == nil {
+			panic("FSContext.Fork() called after destroy")
+		}
 		f.cwd.IncRef()
 		f.root.IncRef()
 	}
 
-	return &FSContext{
+	ctx := &FSContext{
 		cwd:      f.cwd,
 		root:     f.root,
 		cwdVFS2:  f.cwdVFS2,
 		rootVFS2: f.rootVFS2,
 		umask:    f.umask,
 	}
+	ctx.EnableLeakCheck()
+	return ctx
 }
 
 // WorkingDirectory returns the current working directory.
 //
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
 func (f *FSContext) WorkingDirectory() *fs.Dirent {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
-	f.cwd.IncRef()
+	if f.cwd != nil {
+		f.cwd.IncRef()
+	}
 	return f.cwd
 }
 
 // WorkingDirectoryVFS2 returns the current working directory.
 //
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return an empty vfs.VirtualDentry if called after f is
+// destroyed, otherwise it will return a Dirent with a reference taken.
 func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
-	f.cwdVFS2.IncRef()
+	if f.cwdVFS2.Ok() {
+		f.cwdVFS2.IncRef()
+	}
 	return f.cwdVFS2
 }
 
 // SetWorkingDirectory sets the current working directory.
 // This will take an extra reference on the Dirent.
 //
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
 func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) {
 	if d == nil {
 		panic("FSContext.SetWorkingDirectory called with nil dirent")
@@ -187,11 +194,15 @@ func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) {
 // SetWorkingDirectoryVFS2 sets the current working directory.
 // This will take an extra reference on the VirtualDentry.
 //
-// This is not a valid call after destroy.
+// This is not a valid call after f is destroyed.
 func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
+	if !f.cwdVFS2.Ok() {
+		panic(fmt.Sprintf("FSContext.SetWorkingDirectoryVFS2(%v)) called after destroy", d))
+	}
+
 	old := f.cwdVFS2
 	f.cwdVFS2 = d
 	d.IncRef()
@@ -200,8 +211,8 @@ func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDe
 
 // RootDirectory returns the current filesystem root.
 //
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return nil if called after f is destroyed, otherwise it will return
+// a Dirent with a reference taken.
 func (f *FSContext) RootDirectory() *fs.Dirent {
 	f.mu.Lock()
 	defer f.mu.Unlock()
@@ -213,20 +224,22 @@ func (f *FSContext) RootDirectory() *fs.Dirent {
 
 // RootDirectoryVFS2 returns the current filesystem root.
 //
-// This will return nil if called after destroy(), otherwise it will return a
-// Dirent with a reference taken.
+// This will return an empty vfs.VirtualDentry if called after f is
+// destroyed, otherwise it will return a Dirent with a reference taken.
 func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
-	f.rootVFS2.IncRef()
+	if f.rootVFS2.Ok() {
+		f.rootVFS2.IncRef()
+	}
 	return f.rootVFS2
 }
 
 // SetRootDirectory sets the root directory.
 // This will take an extra reference on the Dirent.
 //
-// This is not a valid call after free.
+// This is not a valid call after f is destroyed.
 func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) {
 	if d == nil {
 		panic("FSContext.SetRootDirectory called with nil dirent")
@@ -247,7 +260,7 @@ func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) {
 
 // SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd.
 //
-// This is not a valid call after free.
+// This is not a valid call after f is destroyed.
 func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) {
 	if !vd.Ok() {
 		panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry")
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go
index 80a070d7e..b87e40dd1 100644
--- a/pkg/sentry/kernel/ipc_namespace.go
+++ b/pkg/sentry/kernel/ipc_namespace.go
@@ -15,6 +15,7 @@
 package kernel
 
 import (
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/semaphore"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/shm"
@@ -24,6 +25,8 @@ import (
 //
 // +stateify savable
 type IPCNamespace struct {
+	IPCNamespaceRefs
+
 	// User namespace which owns this IPC namespace. Immutable.
 	userNS *auth.UserNamespace
 
@@ -33,11 +36,13 @@ type IPCNamespace struct {
 
 // NewIPCNamespace creates a new IPC namespace.
 func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace {
-	return &IPCNamespace{
+	ns := &IPCNamespace{
 		userNS:     userNS,
 		semaphores: semaphore.NewRegistry(userNS),
 		shms:       shm.NewRegistry(userNS),
 	}
+	ns.EnableLeakCheck()
+	return ns
 }
 
 // SemaphoreRegistry returns the semaphore set registry for this namespace.
@@ -50,6 +55,13 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry {
 	return i.shms
 }
 
+// DecRef implements refsvfs2.RefCounter.DecRef.
+func (i *IPCNamespace) DecRef(ctx context.Context) {
+	i.IPCNamespaceRefs.DecRef(func() {
+		i.shms.Release(ctx)
+	})
+}
+
 // IPCNamespace returns the task's IPC namespace.
 func (t *Task) IPCNamespace() *IPCNamespace {
 	t.mu.Lock()
diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go
new file mode 100644
index 000000000..4fcdfc541
--- /dev/null
+++ b/pkg/sentry/kernel/kcov.go
@@ -0,0 +1,338 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"fmt"
+	"io"
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/coverage"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov
+// area. On Linux, the maximum is INT_MAX / 8.
+const kcovAreaSizeMax = 10 * 1024 * 1024
+
+// Kcov provides kernel coverage data to userspace through a memory-mapped
+// region, as kcov does in Linux.
+//
+// To give the illusion that the data is always up to date, we update the shared
+// memory every time before we return to userspace.
+type Kcov struct {
+	// mfp provides application memory. It is immutable after creation.
+	mfp pgalloc.MemoryFileProvider
+
+	// mu protects all of the fields below.
+	mu sync.RWMutex
+
+	// mode is the current kcov mode.
+	mode uint8
+
+	// size is the size of the mapping through which the kernel conveys coverage
+	// information to userspace.
+	size uint64
+
+	// owningTask is the task that currently owns coverage data on the system. The
+	// interface for kcov essentially requires that coverage is only going to a
+	// single task. Note that kcov should only generate coverage data for the
+	// owning task, but we currently generate global coverage.
+	owningTask *Task
+
+	// count is a locally cached version of the first uint64 in the kcov data,
+	// which is the number of subsequent entries representing PCs.
+	//
+	// It is used with kcovInode.countBlock(), to copy in/out the first element of
+	// the actual data in an efficient manner, avoid boilerplate, and prevent
+	// accidental garbage escapes by the temporary counts.
+	count uint64
+
+	mappable *mm.SpecialMappable
+}
+
+// NewKcov creates and returns a Kcov instance.
+func (k *Kernel) NewKcov() *Kcov {
+	return &Kcov{
+		mfp: k,
+	}
+}
+
+var coveragePool = sync.Pool{
+	New: func() interface{} {
+		return make([]byte, 0)
+	},
+}
+
+// TaskWork implements TaskWorker.TaskWork.
+func (kcov *Kcov) TaskWork(t *Task) {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	if kcov.mode != linux.KCOV_MODE_TRACE_PC {
+		return
+	}
+
+	rw := &kcovReadWriter{
+		mf: kcov.mfp.MemoryFile(),
+		fr: kcov.mappable.FileRange(),
+	}
+
+	// Read in the PC count.
+	if _, err := safemem.ReadFullToBlocks(rw, kcov.countBlock()); err != nil {
+		panic(fmt.Sprintf("Internal error reading count from kcov area: %v", err))
+	}
+
+	rw.off = 8 * (1 + kcov.count)
+	n := coverage.ConsumeCoverageData(&kcovIOWriter{rw})
+
+	// Update the pc count, based on the number of entries written. Note that if
+	// we reached the end of the kcov area, we may not have written everything in
+	// output.
+	kcov.count += uint64(n / 8)
+	rw.off = 0
+	if _, err := safemem.WriteFullFromBlocks(rw, kcov.countBlock()); err != nil {
+		panic(fmt.Sprintf("Internal error writing count to kcov area: %v", err))
+	}
+
+	// Re-register for future work.
+	t.RegisterWork(kcov)
+}
+
+// InitTrace performs the KCOV_INIT_TRACE ioctl.
+func (kcov *Kcov) InitTrace(size uint64) error {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	if kcov.mode != linux.KCOV_MODE_DISABLED {
+		return syserror.EBUSY
+	}
+
+	// To simplify all the logic around mapping, we require that the length of the
+	// shared region is a multiple of the system page size.
+	if (8*size)&(usermem.PageSize-1) != 0 {
+		return syserror.EINVAL
+	}
+
+	// We need space for at least two uint64s to hold current position and a
+	// single PC.
+	if size < 2 || size > kcovAreaSizeMax {
+		return syserror.EINVAL
+	}
+
+	kcov.size = size
+	kcov.mode = linux.KCOV_MODE_INIT
+	return nil
+}
+
+// EnableTrace performs the KCOV_ENABLE_TRACE ioctl.
+func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error {
+	t := TaskFromContext(ctx)
+	if t == nil {
+		panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine")
+	}
+
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	// KCOV_ENABLE must be preceded by KCOV_INIT_TRACE and an mmap call.
+	if kcov.mode != linux.KCOV_MODE_INIT || kcov.mappable == nil {
+		return syserror.EINVAL
+	}
+
+	switch traceKind {
+	case linux.KCOV_TRACE_PC:
+		kcov.mode = linux.KCOV_MODE_TRACE_PC
+	case linux.KCOV_TRACE_CMP:
+		// We do not support KCOV_MODE_TRACE_CMP.
+		return syserror.ENOTSUP
+	default:
+		return syserror.EINVAL
+	}
+
+	if kcov.owningTask != nil && kcov.owningTask != t {
+		return syserror.EBUSY
+	}
+
+	kcov.owningTask = t
+	t.SetKcov(kcov)
+	t.RegisterWork(kcov)
+
+	// Clear existing coverage data; the task expects to read only coverage data
+	// from the time it is activated.
+	coverage.ClearCoverageData()
+	return nil
+}
+
+// DisableTrace performs the KCOV_DISABLE_TRACE ioctl.
+func (kcov *Kcov) DisableTrace(ctx context.Context) error {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	t := TaskFromContext(ctx)
+	if t == nil {
+		panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine")
+	}
+
+	if t != kcov.owningTask {
+		return syserror.EINVAL
+	}
+	kcov.mode = linux.KCOV_MODE_INIT
+	kcov.owningTask = nil
+	if kcov.mappable != nil {
+		kcov.mappable.DecRef(ctx)
+		kcov.mappable = nil
+	}
+	return nil
+}
+
+// Clear resets the mode and clears the owning task and memory mapping for kcov.
+// It is called when the fd corresponding to kcov is closed. Note that the mode
+// needs to be set so that the next call to kcov.TaskWork() will exit early.
+func (kcov *Kcov) Clear(ctx context.Context) {
+	kcov.mu.Lock()
+	kcov.mode = linux.KCOV_MODE_INIT
+	kcov.owningTask = nil
+	if kcov.mappable != nil {
+		kcov.mappable.DecRef(ctx)
+		kcov.mappable = nil
+	}
+	kcov.mu.Unlock()
+}
+
+// OnTaskExit is called when the owning task exits. It is similar to
+// kcov.Clear(), except the memory mapping is not cleared, so that the same
+// mapping can be used in the future if kcov is enabled again by another task.
+func (kcov *Kcov) OnTaskExit() {
+	kcov.mu.Lock()
+	kcov.mode = linux.KCOV_MODE_INIT
+	kcov.owningTask = nil
+	kcov.mu.Unlock()
+}
+
+// ConfigureMMap is called by the vfs.FileDescription for this kcov instance to
+// implement vfs.FileDescription.ConfigureMMap.
+func (kcov *Kcov) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	kcov.mu.Lock()
+	defer kcov.mu.Unlock()
+
+	if kcov.mode != linux.KCOV_MODE_INIT {
+		return syserror.EINVAL
+	}
+
+	if kcov.mappable == nil {
+		// Set up the kcov area.
+		fr, err := kcov.mfp.MemoryFile().Allocate(kcov.size*8, usage.Anonymous)
+		if err != nil {
+			return err
+		}
+
+		// Get the thread id for the mmap name.
+		t := TaskFromContext(ctx)
+		if t == nil {
+			panic("ThreadFromContext returned nil")
+		}
+		// For convenience, a special mappable is used here. Note that these mappings
+		// will look different under /proc/[pid]/maps than they do on Linux.
+		kcov.mappable = mm.NewSpecialMappable(fmt.Sprintf("[kcov:%d]", t.ThreadID()), kcov.mfp, fr)
+	}
+	kcov.mappable.IncRef()
+	opts.Mappable = kcov.mappable
+	opts.MappingIdentity = kcov.mappable
+	return nil
+}
+
+// kcovReadWriter implements safemem.Reader and safemem.Writer.
+type kcovReadWriter struct {
+	off uint64
+	mf  *pgalloc.MemoryFile
+	fr  memmap.FileRange
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *kcovReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+
+	// Limit the read to the kcov range and check for overflow.
+	if rw.fr.Length() <= rw.off {
+		return 0, io.EOF
+	}
+	start := rw.fr.Start + rw.off
+	end := rw.fr.Start + rw.fr.Length()
+	if rend := start + dsts.NumBytes(); rend < end {
+		end = rend
+	}
+
+	// Get internal mappings.
+	bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Read)
+	if err != nil {
+		return 0, err
+	}
+
+	// Copy from internal mappings.
+	n, err := safemem.CopySeq(dsts, bs)
+	rw.off += n
+	return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *kcovReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+
+	// Limit the write to the kcov area and check for overflow.
+	if rw.fr.Length() <= rw.off {
+		return 0, io.EOF
+	}
+	start := rw.fr.Start + rw.off
+	end := rw.fr.Start + rw.fr.Length()
+	if wend := start + srcs.NumBytes(); wend < end {
+		end = wend
+	}
+
+	// Get internal mapping.
+	bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Write)
+	if err != nil {
+		return 0, err
+	}
+
+	// Copy to internal mapping.
+	n, err := safemem.CopySeq(bs, srcs)
+	rw.off += n
+	return n, err
+}
+
+// kcovIOWriter implements io.Writer as a basic wrapper over kcovReadWriter.
+type kcovIOWriter struct {
+	rw *kcovReadWriter
+}
+
+// Write implements io.Writer.Write.
+func (w *kcovIOWriter) Write(p []byte) (int, error) {
+	bs := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(p))
+	n, err := safemem.WriteFullFromBlocks(w.rw, bs)
+	return int(n), err
+}
diff --git a/pkg/sentry/kernel/kcov_unsafe.go b/pkg/sentry/kernel/kcov_unsafe.go
new file mode 100644
index 000000000..6f8a0266b
--- /dev/null
+++ b/pkg/sentry/kernel/kcov_unsafe.go
@@ -0,0 +1,28 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// countBlock provides a safemem.BlockSeq for kcov.count.
+//
+// Like k.count, the block returned is protected by k.mu.
+func (kcov *Kcov) countBlock() safemem.BlockSeq {
+	return safemem.BlockSeqOf(safemem.BlockFromSafePointer(unsafe.Pointer(&kcov.count), int(unsafe.Sizeof(kcov.count))))
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 1028d13c6..9b2be44d4 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -39,6 +39,7 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/eventchannel"
@@ -220,13 +221,18 @@ type Kernel struct {
 	// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
 	danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`
 
-	// sockets is the list of all network sockets the system. Protected by
-	// extMu.
+	// sockets is the list of all network sockets in the system.
+	// Protected by extMu.
+	// TODO(gvisor.dev/issue/1624): Only used by VFS1.
 	sockets socketList
 
-	// nextSocketEntry is the next entry number to use in sockets. Protected
+	// socketsVFS2 records all network sockets in the system. Protected by
+	// extMu.
+	socketsVFS2 map[*vfs.FileDescription]*SocketRecord
+
+	// nextSocketRecord is the next entry number to use in sockets. Protected
 	// by extMu.
-	nextSocketEntry uint64
+	nextSocketRecord uint64
 
 	// deviceRegistry is used to save/restore device.SimpleDevices.
 	deviceRegistry struct{} `state:".(*device.Registry)"`
@@ -248,7 +254,7 @@ type Kernel struct {
 	// SpecialOpts contains special kernel options.
 	SpecialOpts
 
-	// VFS keeps the filesystem state used across the kernel.
+	// vfs keeps the filesystem state used across the kernel.
 	vfs vfs.VirtualFilesystem
 
 	// hostMount is the Mount used for file descriptors that were imported
@@ -335,7 +341,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 		return fmt.Errorf("Timekeeper is nil")
 	}
 	if args.Timekeeper.clocks == nil {
-		return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()")
+		return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()")
 	}
 	if args.RootUserNamespace == nil {
 		return fmt.Errorf("RootUserNamespace is nil")
@@ -360,7 +366,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 		k.useHostCores = true
 		maxCPU, err := hostcpu.MaxPossibleCPU()
 		if err != nil {
-			return fmt.Errorf("Failed to get maximum CPU number: %v", err)
+			return fmt.Errorf("failed to get maximum CPU number: %v", err)
 		}
 		minAppCores := uint(maxCPU) + 1
 		if k.applicationCores < minAppCores {
@@ -414,6 +420,8 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 			return fmt.Errorf("failed to create sockfs mount: %v", err)
 		}
 		k.socketMount = socketMount
+
+		k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord)
 	}
 
 	return nil
@@ -422,9 +430,8 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 // SaveTo saves the state of k to w.
 //
 // Preconditions: The kernel must be paused throughout the call to SaveTo.
-func (k *Kernel) SaveTo(w wire.Writer) error {
+func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error {
 	saveStart := time.Now()
-	ctx := k.SupervisorContext()
 
 	// Do not allow other Kernel methods to affect it while it's being saved.
 	k.extMu.Lock()
@@ -438,38 +445,55 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
 	k.mf.StartEvictions()
 	k.mf.WaitForEvictions()
 
-	// Flush write operations on open files so data reaches backing storage.
-	// This must come after MemoryFile eviction since eviction may cause file
-	// writes.
-	if err := k.tasks.flushWritesToFiles(ctx); err != nil {
-		return err
-	}
+	if VFS2Enabled {
+		// Discard unsavable mappings, such as those for host file descriptors.
+		if err := k.invalidateUnsavableMappings(ctx); err != nil {
+			return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+		}
 
-	// Remove all epoll waiter objects from underlying wait queues.
-	// NOTE: for programs to resume execution in future snapshot scenarios,
-	// we will need to re-establish these waiter objects after saving.
-	k.tasks.unregisterEpollWaiters(ctx)
+		// Prepare filesystems for saving. This must be done after
+		// invalidateUnsavableMappings(), since dropping memory mappings may
+		// affect filesystem state (e.g. page cache reference counts).
+		if err := k.vfs.PrepareSave(ctx); err != nil {
+			return err
+		}
+	} else {
+		// Flush cached file writes to backing storage. This must come after
+		// MemoryFile eviction since eviction may cause file writes.
+		if err := k.flushWritesToFiles(ctx); err != nil {
+			return err
+		}
 
-	// Clear the dirent cache before saving because Dirents must be Loaded in a
-	// particular order (parents before children), and Loading dirents from a cache
-	// breaks that order.
-	if err := k.flushMountSourceRefs(ctx); err != nil {
-		return err
-	}
+		// Remove all epoll waiter objects from underlying wait queues.
+		// NOTE: for programs to resume execution in future snapshot scenarios,
+		// we will need to re-establish these waiter objects after saving.
+		k.tasks.unregisterEpollWaiters(ctx)
 
-	// Ensure that all inode and mount release operations have completed.
-	fs.AsyncBarrier()
+		// Clear the dirent cache before saving because Dirents must be Loaded in a
+		// particular order (parents before children), and Loading dirents from a cache
+		// breaks that order.
+		if err := k.flushMountSourceRefs(ctx); err != nil {
+			return err
+		}
+
+		// Ensure that all inode and mount release operations have completed.
+		fs.AsyncBarrier()
 
-	// Once all fs work has completed (flushed references have all been released),
-	// reset mount mappings. This allows individual mounts to save how inodes map
-	// to filesystem resources. Without this, fs.Inodes cannot be restored.
-	fs.SaveInodeMappings()
+		// Once all fs work has completed (flushed references have all been released),
+		// reset mount mappings. This allows individual mounts to save how inodes map
+		// to filesystem resources. Without this, fs.Inodes cannot be restored.
+		fs.SaveInodeMappings()
 
-	// Discard unsavable mappings, such as those for host file descriptors.
-	// This must be done after waiting for "asynchronous fs work", which
-	// includes async I/O that may touch application memory.
-	if err := k.invalidateUnsavableMappings(ctx); err != nil {
-		return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+		// Discard unsavable mappings, such as those for host file descriptors.
+		// This must be done after waiting for "asynchronous fs work", which
+		// includes async I/O that may touch application memory.
+		//
+		// TODO(gvisor.dev/issue/1624): This rationale is believed to be
+		// obsolete since AIO callbacks are now waited-for by Kernel.Pause(),
+		// but this order is conservatively retained for VFS1.
+		if err := k.invalidateUnsavableMappings(ctx); err != nil {
+			return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+		}
 	}
 
 	// Save the CPUID FeatureSet before the rest of the kernel so we can
@@ -478,14 +502,14 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
 	//
 	// N.B. This will also be saved along with the full kernel save below.
 	cpuidStart := time.Now()
-	if _, err := state.Save(k.SupervisorContext(), w, k.FeatureSet()); err != nil {
+	if _, err := state.Save(ctx, w, k.FeatureSet()); err != nil {
 		return err
 	}
 	log.Infof("CPUID save took [%s].", time.Since(cpuidStart))
 
 	// Save the kernel state.
 	kernelStart := time.Now()
-	stats, err := state.Save(k.SupervisorContext(), w, k)
+	stats, err := state.Save(ctx, w, k)
 	if err != nil {
 		return err
 	}
@@ -494,7 +518,7 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
 
 	// Save the memory file's state.
 	memoryStart := time.Now()
-	if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil {
+	if err := k.mf.SaveTo(ctx, w); err != nil {
 		return err
 	}
 	log.Infof("Memory save took [%s].", time.Since(memoryStart))
@@ -506,6 +530,8 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
 
 // flushMountSourceRefs flushes the MountSources for all mounted filesystems
 // and open FDs.
+//
+// Preconditions: !VFS2Enabled.
 func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
 	// Flush all mount sources for currently mounted filesystems in each task.
 	flushed := make(map[*fs.MountNamespace]struct{})
@@ -533,11 +559,6 @@ func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
 //
 // Precondition: Must be called with the kernel paused.
 func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) {
-	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
-	if VFS2Enabled {
-		return nil
-	}
-
 	ts.mu.RLock()
 	defer ts.mu.RUnlock()
 	for t := range ts.Root.tids {
@@ -554,9 +575,9 @@ func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.Fi
 	return err
 }
 
-func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
-	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
-	return ts.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
+// Preconditions: !VFS2Enabled.
+func (k *Kernel) flushWritesToFiles(ctx context.Context) error {
+	return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
 		if flags := file.Flags(); !flags.Write {
 			return nil
 		}
@@ -578,37 +599,8 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
 	})
 }
 
-// Preconditions: The kernel must be paused.
-func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
-	invalidated := make(map[*mm.MemoryManager]struct{})
-	k.tasks.mu.RLock()
-	defer k.tasks.mu.RUnlock()
-	for t := range k.tasks.Root.tids {
-		// We can skip locking Task.mu here since the kernel is paused.
-		if mm := t.tc.MemoryManager; mm != nil {
-			if _, ok := invalidated[mm]; !ok {
-				if err := mm.InvalidateUnsavable(ctx); err != nil {
-					return err
-				}
-				invalidated[mm] = struct{}{}
-			}
-		}
-		// I really wish we just had a sync.Map of all MMs...
-		if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
-			if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil {
-				return err
-			}
-		}
-	}
-	return nil
-}
-
+// Preconditions: !VFS2Enabled.
 func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) {
-	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
-	if VFS2Enabled {
-		return
-	}
-
 	ts.mu.RLock()
 	defer ts.mu.RUnlock()
 
@@ -633,8 +625,33 @@ func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) {
 	}
 }
 
+// Preconditions: The kernel must be paused.
+func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
+	invalidated := make(map[*mm.MemoryManager]struct{})
+	k.tasks.mu.RLock()
+	defer k.tasks.mu.RUnlock()
+	for t := range k.tasks.Root.tids {
+		// We can skip locking Task.mu here since the kernel is paused.
+		if mm := t.tc.MemoryManager; mm != nil {
+			if _, ok := invalidated[mm]; !ok {
+				if err := mm.InvalidateUnsavable(ctx); err != nil {
+					return err
+				}
+				invalidated[mm] = struct{}{}
+			}
+		}
+		// I really wish we just had a sync.Map of all MMs...
+		if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
+			if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
 // LoadFrom returns a new Kernel loaded from args.
-func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
+func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
 	loadStart := time.Now()
 
 	initAppCores := k.applicationCores
@@ -645,7 +662,7 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock
 	// don't need to explicitly install it in the Kernel.
 	cpuidStart := time.Now()
 	var features cpuid.FeatureSet
-	if _, err := state.Load(k.SupervisorContext(), r, &features); err != nil {
+	if _, err := state.Load(ctx, r, &features); err != nil {
 		return err
 	}
 	log.Infof("CPUID load took [%s].", time.Since(cpuidStart))
@@ -660,7 +677,7 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock
 
 	// Load the kernel state.
 	kernelStart := time.Now()
-	stats, err := state.Load(k.SupervisorContext(), r, k)
+	stats, err := state.Load(ctx, r, k)
 	if err != nil {
 		return err
 	}
@@ -673,7 +690,7 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock
 
 	// Load the memory file's state.
 	memoryStart := time.Now()
-	if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil {
+	if err := k.mf.LoadFrom(ctx, r); err != nil {
 		return err
 	}
 	log.Infof("Memory load took [%s].", time.Since(memoryStart))
@@ -685,11 +702,17 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock
 		net.Resume()
 	}
 
-	// Ensure that all pending asynchronous work is complete:
-	//   - namedpipe opening
-	//   - inode file opening
-	if err := fs.AsyncErrorBarrier(); err != nil {
-		return err
+	if VFS2Enabled {
+		if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil {
+			return err
+		}
+	} else {
+		// Ensure that all pending asynchronous work is complete:
+		//   - namedpipe opening
+		//   - inode file opening
+		if err := fs.AsyncErrorBarrier(); err != nil {
+			return err
+		}
 	}
 
 	tcpip.AsyncLoading.Wait()
@@ -818,7 +841,9 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
 	case CtxUTSNamespace:
 		return ctx.args.UTSNamespace
 	case CtxIPCNamespace:
-		return ctx.args.IPCNamespace
+		ipcns := ctx.args.IPCNamespace
+		ipcns.IncRef()
+		return ipcns
 	case auth.CtxCredentials:
 		return ctx.args.Credentials
 	case fs.CtxRoot:
@@ -831,14 +856,16 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
 		if ctx.args.MountNamespaceVFS2 == nil {
 			return nil
 		}
-		// MountNamespaceVFS2.Root() takes a reference on the root dirent for us.
-		return ctx.args.MountNamespaceVFS2.Root()
+		root := ctx.args.MountNamespaceVFS2.Root()
+		root.IncRef()
+		return root
 	case vfs.CtxMountNamespace:
 		if ctx.k.globalInit == nil {
 			return nil
 		}
-		// MountNamespaceVFS2 takes a reference for us.
-		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+		mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+		mntns.IncRef()
+		return mntns
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
 	case inet.CtxStack:
@@ -888,19 +915,19 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		opener    fsbridge.Lookup
 		fsContext *FSContext
 		mntns     *fs.MountNamespace
+		mntnsVFS2 *vfs.MountNamespace
 	)
 
 	if VFS2Enabled {
-		mntnsVFS2 := args.MountNamespaceVFS2
+		mntnsVFS2 = args.MountNamespaceVFS2
 		if mntnsVFS2 == nil {
-			// MountNamespaceVFS2 adds a reference to the namespace, which is
-			// transferred to the new process.
+			// Add a reference to the namespace, which is transferred to the new process.
 			mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2()
+			mntnsVFS2.IncRef()
 		}
 		// Get the root directory from the MountNamespace.
-		root := args.MountNamespaceVFS2.Root()
-		// The call to newFSContext below will take a reference on root, so we
-		// don't need to hold this one.
+		root := mntnsVFS2.Root()
+		root.IncRef()
 		defer root.DecRef(ctx)
 
 		// Grab the working directory.
@@ -952,6 +979,10 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	}
 
 	tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
+	cu := cleanup.Make(func() {
+		tg.Release(ctx)
+	})
+	defer cu.Clean()
 
 	// Check which file to start from.
 	switch {
@@ -1008,16 +1039,17 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		UTSNamespace:            args.UTSNamespace,
 		IPCNamespace:            args.IPCNamespace,
 		AbstractSocketNamespace: args.AbstractSocketNamespace,
-		MountNamespaceVFS2:      args.MountNamespaceVFS2,
+		MountNamespaceVFS2:      mntnsVFS2,
 		ContainerID:             args.ContainerID,
 	}
-	t, err := k.tasks.NewTask(config)
+	t, err := k.tasks.NewTask(ctx, config)
 	if err != nil {
 		return nil, 0, err
 	}
 	t.traceExecEvent(tc) // Simulate exec for tracing.
 
 	// Success.
+	cu.Release()
 	tgid := k.tasks.Root.IDOfThreadGroup(tg)
 	if k.globalInit == nil {
 		k.globalInit = tg
@@ -1067,8 +1099,9 @@ func (k *Kernel) Start() error {
 
 // pauseTimeLocked pauses all Timers and Timekeeper updates.
 //
-// Preconditions: Any task goroutines running in k must be stopped. k.extMu
-// must be locked.
+// Preconditions:
+// * Any task goroutines running in k must be stopped.
+// * k.extMu must be locked.
 func (k *Kernel) pauseTimeLocked(ctx context.Context) {
 	// k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
 	// Kernel.Start().
@@ -1111,8 +1144,9 @@ func (k *Kernel) pauseTimeLocked(ctx context.Context) {
 // pauseTimeLocked has not been previously called, resumeTimeLocked has no
 // effect.
 //
-// Preconditions: Any task goroutines running in k must be stopped. k.extMu
-// must be locked.
+// Preconditions:
+// * Any task goroutines running in k must be stopped.
+// * k.extMu must be locked.
 func (k *Kernel) resumeTimeLocked(ctx context.Context) {
 	if k.cpuClockTicker != nil {
 		k.cpuClockTicker.Resume()
@@ -1360,8 +1394,9 @@ func (k *Kernel) RootUTSNamespace() *UTSNamespace {
 	return k.rootUTSNamespace
 }
 
-// RootIPCNamespace returns the root IPCNamespace.
+// RootIPCNamespace takes a reference and returns the root IPCNamespace.
 func (k *Kernel) RootIPCNamespace() *IPCNamespace {
+	k.rootIPCNamespace.IncRef()
 	return k.rootIPCNamespace
 }
 
@@ -1506,20 +1541,27 @@ func (k *Kernel) SupervisorContext() context.Context {
 	}
 }
 
-// SocketEntry represents a socket recorded in Kernel.sockets. It implements
+// SocketRecord represents a socket recorded in Kernel.socketsVFS2.
+//
+// +stateify savable
+type SocketRecord struct {
+	k        *Kernel
+	Sock     *refs.WeakRef        // TODO(gvisor.dev/issue/1624): Only used by VFS1.
+	SockVFS2 *vfs.FileDescription // Only used by VFS2.
+	ID       uint64               // Socket table entry number.
+}
+
+// SocketRecordVFS1 represents a socket recorded in Kernel.sockets. It implements
 // refs.WeakRefUser for sockets stored in the socket table.
 //
 // +stateify savable
-type SocketEntry struct {
+type SocketRecordVFS1 struct {
 	socketEntry
-	k        *Kernel
-	Sock     *refs.WeakRef
-	SockVFS2 *vfs.FileDescription
-	ID       uint64 // Socket table entry number.
+	SocketRecord
 }
 
 // WeakRefGone implements refs.WeakRefUser.WeakRefGone.
-func (s *SocketEntry) WeakRefGone(context.Context) {
+func (s *SocketRecordVFS1) WeakRefGone(context.Context) {
 	s.k.extMu.Lock()
 	s.k.sockets.Remove(s)
 	s.k.extMu.Unlock()
@@ -1530,9 +1572,14 @@ func (s *SocketEntry) WeakRefGone(context.Context) {
 // Precondition: Caller must hold a reference to sock.
 func (k *Kernel) RecordSocket(sock *fs.File) {
 	k.extMu.Lock()
-	id := k.nextSocketEntry
-	k.nextSocketEntry++
-	s := &SocketEntry{k: k, ID: id}
+	id := k.nextSocketRecord
+	k.nextSocketRecord++
+	s := &SocketRecordVFS1{
+		SocketRecord: SocketRecord{
+			k:  k,
+			ID: id,
+		},
+	}
 	s.Sock = refs.NewWeakRef(sock, s)
 	k.sockets.PushBack(s)
 	k.extMu.Unlock()
@@ -1544,29 +1591,45 @@ func (k *Kernel) RecordSocket(sock *fs.File) {
 // Precondition: Caller must hold a reference to sock.
 //
 // Note that the socket table will not hold a reference on the
-// vfs.FileDescription, because we do not support weak refs on VFS2 files.
+// vfs.FileDescription.
 func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) {
 	k.extMu.Lock()
-	id := k.nextSocketEntry
-	k.nextSocketEntry++
-	s := &SocketEntry{
+	if _, ok := k.socketsVFS2[sock]; ok {
+		panic(fmt.Sprintf("Socket %p added twice", sock))
+	}
+	id := k.nextSocketRecord
+	k.nextSocketRecord++
+	s := &SocketRecord{
 		k:        k,
 		ID:       id,
 		SockVFS2: sock,
 	}
-	k.sockets.PushBack(s)
+	k.socketsVFS2[sock] = s
+	k.extMu.Unlock()
+}
+
+// DeleteSocketVFS2 removes a VFS2 socket from the system-wide socket table.
+func (k *Kernel) DeleteSocketVFS2(sock *vfs.FileDescription) {
+	k.extMu.Lock()
+	delete(k.socketsVFS2, sock)
 	k.extMu.Unlock()
 }
 
 // ListSockets returns a snapshot of all sockets.
 //
-// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef()
+// Callers of ListSockets() in VFS2 should use SocketRecord.SockVFS2.TryIncRef()
 // to get a reference on a socket in the table.
-func (k *Kernel) ListSockets() []*SocketEntry {
+func (k *Kernel) ListSockets() []*SocketRecord {
 	k.extMu.Lock()
-	var socks []*SocketEntry
-	for s := k.sockets.Front(); s != nil; s = s.Next() {
-		socks = append(socks, s)
+	var socks []*SocketRecord
+	if VFS2Enabled {
+		for _, s := range k.socketsVFS2 {
+			socks = append(socks, s)
+		}
+	} else {
+		for s := k.sockets.Front(); s != nil; s = s.Next() {
+			socks = append(socks, &s.SocketRecord)
+		}
 	}
 	k.extMu.Unlock()
 	return socks
@@ -1594,7 +1657,9 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
 	case CtxUTSNamespace:
 		return ctx.k.rootUTSNamespace
 	case CtxIPCNamespace:
-		return ctx.k.rootIPCNamespace
+		ipcns := ctx.k.rootIPCNamespace
+		ipcns.IncRef()
+		return ipcns
 	case auth.CtxCredentials:
 		// The supervisor context is global root.
 		return auth.NewRootCredentials(ctx.k.rootUserNamespace)
@@ -1607,16 +1672,16 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
 		if ctx.k.globalInit == nil {
 			return vfs.VirtualDentry{}
 		}
-		mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
-		defer mntns.DecRef(ctx)
-		// Root() takes a reference on the root dirent for us.
-		return mntns.Root()
+		root := ctx.k.GlobalInit().Leader().MountNamespaceVFS2().Root()
+		root.IncRef()
+		return root
 	case vfs.CtxMountNamespace:
 		if ctx.k.globalInit == nil {
 			return nil
 		}
-		// MountNamespaceVFS2() takes a reference for us.
-		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+		mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+		mntns.IncRef()
+		return mntns
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
 	case inet.CtxStack:
@@ -1697,3 +1762,20 @@ func (k *Kernel) ShmMount() *vfs.Mount {
 func (k *Kernel) SocketMount() *vfs.Mount {
 	return k.socketMount
 }
+
+// Release releases resources owned by k.
+//
+// Precondition: This should only be called after the kernel is fully
+// initialized, e.g. after k.Start() has been called.
+func (k *Kernel) Release() {
+	ctx := k.SupervisorContext()
+	if VFS2Enabled {
+		k.hostMount.DecRef(ctx)
+		k.pipeMount.DecRef(ctx)
+		k.shmMount.DecRef(ctx)
+		k.socketMount.DecRef(ctx)
+		k.vfs.Release(ctx)
+	}
+	k.timekeeper.Destroy()
+	k.vdso.Release(ctx)
+}
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 449643118..99134e634 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -21,6 +21,7 @@ go_library(
         "//pkg/amutex",
         "//pkg/buffer",
         "//pkg/context",
+        "//pkg/marshal/primitive",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/device",
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 297e8f28f..67beb0ad6 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -17,6 +17,7 @@ package pipe
 
 import (
 	"fmt"
+	"io"
 	"sync/atomic"
 	"syscall"
 
@@ -200,22 +201,22 @@ type readOps struct {
 //
 // Precondition: this pipe must have readers.
 func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
-	// Don't block for a zero-length read even if the pipe is empty.
-	if ops.left() == 0 {
-		return 0, nil
-	}
-
 	p.mu.Lock()
 	defer p.mu.Unlock()
 	return p.readLocked(ctx, ops)
 }
 
 func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) {
+	// Don't block for a zero-length read even if the pipe is empty.
+	if ops.left() == 0 {
+		return 0, nil
+	}
+
 	// Is the pipe empty?
 	if p.view.Size() == 0 {
 		if !p.HasWriters() {
 			// There are no writers, return EOF.
-			return 0, nil
+			return 0, io.EOF
 		}
 		return 0, syserror.ErrWouldBlock
 	}
@@ -388,6 +389,10 @@ func (p *Pipe) rwReadiness() waiter.EventMask {
 func (p *Pipe) queued() int64 {
 	p.mu.Lock()
 	defer p.mu.Unlock()
+	return p.queuedLocked()
+}
+
+func (p *Pipe) queuedLocked() int64 {
 	return p.view.Size()
 }
 
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index 6d58b682f..f665920cb 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/amutex"
 	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -145,9 +146,14 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume
 			v = math.MaxInt32 // Silently truncate.
 		}
 		// Copy result to userspace.
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		iocc := primitive.IOCopyContext{
+			IO:  io,
+			Ctx: ctx,
+			Opts: usermem.IOOpts{
+				AddressSpaceActive: true,
+			},
+		}
+		_, err := primitive.CopyInt32Out(&iocc, args[2].Pointer(), int32(v))
 		return 0, err
 	default:
 		return 0, syscall.ENOTTY
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index 28f998e45..d96bf253b 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -33,6 +33,8 @@ import (
 
 // VFSPipe represents the actual pipe, analagous to an inode. VFSPipes should
 // not be copied.
+//
+// +stateify savable
 type VFSPipe struct {
 	// mu protects the fields below.
 	mu sync.Mutex `state:"nosave"`
@@ -67,6 +69,11 @@ func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlag
 	return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks)
 }
 
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error {
+	return syserror.ESPIPE
+}
+
 // Open opens the pipe represented by vp.
 func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
 	vp.mu.Lock()
@@ -159,6 +166,8 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, l
 // VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements
 // non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to
 // other FileDescriptions for splice(2) and tee(2).
+//
+// +stateify savable
 type VFSPipeFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -232,8 +241,7 @@ func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal
 
 // PipeSize implements fcntl(F_GETPIPE_SZ).
 func (fd *VFSPipeFD) PipeSize() int64 {
-	// Inline Pipe.FifoSize() rather than calling it with nil Context and
-	// fs.File and ignoring the returned error (which is always nil).
+	// Inline Pipe.FifoSize() since we don't have a fs.File.
 	fd.pipe.mu.Lock()
 	defer fd.pipe.mu.Unlock()
 	return fd.pipe.max
@@ -244,19 +252,57 @@ func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
 	return fd.pipe.SetFifoSize(size)
 }
 
-// IOSequence returns a useremm.IOSequence that reads up to count bytes from,
-// or writes up to count bytes to, fd.
-func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence {
-	return usermem.IOSequence{
+// SpliceToNonPipe performs a splice operation from fd to a non-pipe file.
+func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) {
+	fd.pipe.mu.Lock()
+	defer fd.pipe.mu.Unlock()
+
+	// Cap the sequence at number of bytes actually available.
+	v := fd.pipe.queuedLocked()
+	if v < count {
+		count = v
+	}
+	src := usermem.IOSequence{
 		IO:    fd,
 		Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
 	}
+
+	var (
+		n   int64
+		err error
+	)
+	if off == -1 {
+		n, err = out.Write(ctx, src, vfs.WriteOptions{})
+	} else {
+		n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{})
+	}
+	if n > 0 {
+		fd.pipe.view.TrimFront(n)
+	}
+	return n, err
+}
+
+// SpliceFromNonPipe performs a splice operation from a non-pipe file to fd.
+func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) {
+	fd.pipe.mu.Lock()
+	defer fd.pipe.mu.Unlock()
+
+	dst := usermem.IOSequence{
+		IO:    fd,
+		Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
+	}
+
+	if off == -1 {
+		return in.Read(ctx, dst, vfs.ReadOptions{})
+	}
+	return in.PRead(ctx, dst, off, vfs.ReadOptions{})
 }
 
-// CopyIn implements usermem.IO.CopyIn.
+// CopyIn implements usermem.IO.CopyIn. Note that it is the caller's
+// responsibility to trim fd.pipe.view after the read is completed.
 func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
 	origCount := int64(len(dst))
-	n, err := fd.pipe.read(ctx, readOps{
+	n, err := fd.pipe.readLocked(ctx, readOps{
 		left: func() int64 {
 			return int64(len(dst))
 		},
@@ -265,7 +311,6 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte,
 		},
 		read: func(view *buffer.View) (int64, error) {
 			n, err := view.ReadAt(dst, 0)
-			view.TrimFront(int64(n))
 			return int64(n), err
 		},
 	})
@@ -281,7 +326,7 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte,
 // CopyOut implements usermem.IO.CopyOut.
 func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
 	origCount := int64(len(src))
-	n, err := fd.pipe.write(ctx, writeOps{
+	n, err := fd.pipe.writeLocked(ctx, writeOps{
 		left: func() int64 {
 			return int64(len(src))
 		},
@@ -305,7 +350,7 @@ func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte,
 // ZeroOut implements usermem.IO.ZeroOut.
 func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
 	origCount := toZero
-	n, err := fd.pipe.write(ctx, writeOps{
+	n, err := fd.pipe.writeLocked(ctx, writeOps{
 		left: func() int64 {
 			return toZero
 		},
@@ -326,14 +371,15 @@ func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int6
 	return n, err
 }
 
-// CopyInTo implements usermem.IO.CopyInTo.
+// CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's
+// responsibility to trim fd.pipe.view after the read is completed.
 func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
 	count := ars.NumBytes()
 	if count == 0 {
 		return 0, nil
 	}
 	origCount := count
-	n, err := fd.pipe.read(ctx, readOps{
+	n, err := fd.pipe.readLocked(ctx, readOps{
 		left: func() int64 {
 			return count
 		},
@@ -342,7 +388,6 @@ func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst
 		},
 		read: func(view *buffer.View) (int64, error) {
 			n, err := view.ReadToSafememWriter(dst, uint64(count))
-			view.TrimFront(int64(n))
 			return int64(n), err
 		},
 	})
@@ -362,7 +407,7 @@ func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq,
 		return 0, nil
 	}
 	origCount := count
-	n, err := fd.pipe.write(ctx, writeOps{
+	n, err := fd.pipe.writeLocked(ctx, writeOps{
 		left: func() int64 {
 			return count
 		},
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 619b0cb7c..1145faf13 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -18,6 +18,7 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -224,8 +225,9 @@ func (s *ptraceStop) Killable() bool {
 // beginPtraceStopLocked does not signal t's tracer or wake it if it is
 // waiting.
 //
-// Preconditions: The TaskSet mutex must be locked. The caller must be running
-// on the task goroutine.
+// Preconditions:
+// * The TaskSet mutex must be locked.
+// * The caller must be running on the task goroutine.
 func (t *Task) beginPtraceStopLocked() bool {
 	t.tg.signalHandlers.mu.Lock()
 	defer t.tg.signalHandlers.mu.Unlock()
@@ -270,8 +272,9 @@ func (t *Task) ptraceTrapLocked(code int32) {
 // ptraceStop, temporarily preventing it from being removed by a concurrent
 // Task.Kill, and returns true. Otherwise it returns false.
 //
-// Preconditions: The TaskSet mutex must be locked. The caller must be running
-// on the task goroutine of t's tracer.
+// Preconditions:
+// * The TaskSet mutex must be locked.
+// * The caller must be running on the task goroutine of t's tracer.
 func (t *Task) ptraceFreeze() bool {
 	t.tg.signalHandlers.mu.Lock()
 	defer t.tg.signalHandlers.mu.Unlock()
@@ -301,8 +304,9 @@ func (t *Task) ptraceUnfreeze() {
 	t.ptraceUnfreezeLocked()
 }
 
-// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be
-// locked.
+// Preconditions:
+// * t must be in a frozen ptraceStop.
+// * t's signal mutex must be locked.
 func (t *Task) ptraceUnfreezeLocked() {
 	// Do this even if the task has been killed to ensure a panic if t.stop is
 	// nil or not a ptraceStop.
@@ -497,8 +501,9 @@ func (t *Task) forgetTracerLocked() {
 // ptraceSignalLocked is called after signal dequeueing to check if t should
 // enter ptrace signal-delivery-stop.
 //
-// Preconditions: The signal mutex must be locked. The caller must be running
-// on the task goroutine.
+// Preconditions:
+// * The signal mutex must be locked.
+// * The caller must be running on the task goroutine.
 func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool {
 	if linux.Signal(info.Signo) == linux.SIGKILL {
 		return false
@@ -828,8 +833,9 @@ func (t *Task) ptraceInterrupt(target *Task) error {
 	return nil
 }
 
-// Preconditions: The TaskSet mutex must be locked for writing. t must have a
-// tracer.
+// Preconditions:
+// * The TaskSet mutex must be locked for writing.
+// * t must have a tracer.
 func (t *Task) ptraceSetOptionsLocked(opts uintptr) error {
 	const valid = uintptr(linux.PTRACE_O_EXITKILL |
 		linux.PTRACE_O_TRACESYSGOOD |
@@ -994,18 +1000,15 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
 		// at the address specified by the data parameter, and the return value
 		// is the error flag." - ptrace(2)
 		word := t.Arch().Native(0)
-		if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{
-			IgnorePermissions: true,
-		}); err != nil {
+		if _, err := word.CopyIn(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
 			return err
 		}
-		_, err := t.CopyOut(data, word)
+		_, err := word.CopyOut(t, data)
 		return err
 
 	case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA:
-		_, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{
-			IgnorePermissions: true,
-		})
+		word := t.Arch().Native(uintptr(data))
+		_, err := word.CopyOut(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr)
 		return err
 
 	case linux.PTRACE_GETREGSET:
@@ -1073,12 +1076,12 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
 		if target.ptraceSiginfo == nil {
 			return syserror.EINVAL
 		}
-		_, err := t.CopyOut(data, target.ptraceSiginfo)
+		_, err := target.ptraceSiginfo.CopyOut(t, data)
 		return err
 
 	case linux.PTRACE_SETSIGINFO:
 		var info arch.SignalInfo
-		if _, err := t.CopyIn(data, &info); err != nil {
+		if _, err := info.CopyIn(t, data); err != nil {
 			return err
 		}
 		t.tg.pidns.owner.mu.RLock()
@@ -1093,7 +1096,8 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
 		if addr != linux.SignalSetSize {
 			return syserror.EINVAL
 		}
-		_, err := t.CopyOut(data, target.SignalMask())
+		mask := target.SignalMask()
+		_, err := mask.CopyOut(t, data)
 		return err
 
 	case linux.PTRACE_SETSIGMASK:
@@ -1101,7 +1105,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
 			return syserror.EINVAL
 		}
 		var mask linux.SignalSet
-		if _, err := t.CopyIn(data, &mask); err != nil {
+		if _, err := mask.CopyIn(t, data); err != nil {
 			return err
 		}
 		// The target's task goroutine is stopped, so this is safe:
@@ -1116,7 +1120,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
 	case linux.PTRACE_GETEVENTMSG:
 		t.tg.pidns.owner.mu.RLock()
 		defer t.tg.pidns.owner.mu.RUnlock()
-		_, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg)
+		_, err := primitive.CopyUint64Out(t, usermem.Addr(data), target.ptraceEventMsg)
 		return err
 
 	// PEEKSIGINFO is unimplemented but seems to have no users anywhere.
diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go
index cef1276ec..609ad3941 100644
--- a/pkg/sentry/kernel/ptrace_amd64.go
+++ b/pkg/sentry/kernel/ptrace_amd64.go
@@ -30,7 +30,7 @@ func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) erro
 		if err != nil {
 			return err
 		}
-		_, err = t.CopyOut(data, n)
+		_, err = n.CopyOut(t, data)
 		return err
 
 	case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index 18416643b..2a9023fdf 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -173,8 +173,10 @@ func (t *Task) OldRSeqCPUAddr() usermem.Addr {
 // SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with
 // t's CPU number.
 //
-// Preconditions: t.RSeqAvailable() == true. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions:
+// * t.RSeqAvailable() == true.
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
 	t.oldRSeqCPUAddr = addr
 
@@ -189,8 +191,9 @@ func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
 	return nil
 }
 
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) rseqUpdateCPU() error {
 	if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
 		t.rseqCPU = -1
@@ -209,8 +212,9 @@ func (t *Task) rseqUpdateCPU() error {
 	return oerr
 }
 
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) oldRSeqCopyOutCPU() error {
 	if t.oldRSeqCPUAddr == 0 {
 		return nil
@@ -222,8 +226,9 @@ func (t *Task) oldRSeqCopyOutCPU() error {
 	return err
 }
 
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) rseqCopyOutCPU() error {
 	if t.rseqAddr == 0 {
 		return nil
@@ -240,8 +245,9 @@ func (t *Task) rseqCopyOutCPU() error {
 	return err
 }
 
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) rseqClearCPU() error {
 	buf := t.CopyScratchBuffer(8)
 	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
@@ -269,8 +275,9 @@ func (t *Task) rseqClearCPU() error {
 //
 // See kernel/rseq.c:rseq_ip_fixup for reference.
 //
-// Preconditions: The caller must be running on the task goroutine. t's
-// AddressSpace must be active.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) rseqAddrInterrupt() {
 	if t.rseqAddr == 0 {
 		return
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
index c38c5a40c..387edfa91 100644
--- a/pkg/sentry/kernel/seccomp.go
+++ b/pkg/sentry/kernel/seccomp.go
@@ -18,7 +18,6 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/bpf"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -27,25 +26,18 @@ import (
 
 const maxSyscallFilterInstructions = 1 << 15
 
-// seccompData is equivalent to struct seccomp_data, which contains the data
-// passed to seccomp-bpf filters.
-type seccompData struct {
-	// nr is the system call number.
-	nr int32
-
-	// arch is an AUDIT_ARCH_* value indicating the system call convention.
-	arch uint32
-
-	// instructionPointer is the value of the instruction pointer at the time
-	// of the system call.
-	instructionPointer uint64
-
-	// args contains the first 6 system call arguments.
-	args [6]uint64
-}
-
-func (d *seccompData) asBPFInput() bpf.Input {
-	return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder}
+// dataAsBPFInput returns a serialized BPF program, only valid on the current task
+// goroutine.
+//
+// Note: this is called for every syscall, which is a very hot path.
+func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input {
+	buf := t.CopyScratchBuffer(d.SizeBytes())
+	d.MarshalUnsafe(buf)
+	return bpf.InputBytes{
+		Data: buf,
+		// Go-marshal always uses the native byte order.
+		Order: usermem.ByteOrder,
+	}
 }
 
 func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo {
@@ -112,20 +104,20 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u
 }
 
 func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 {
-	data := seccompData{
-		nr:                 sysno,
-		arch:               t.tc.st.AuditNumber,
-		instructionPointer: uint64(ip),
+	data := linux.SeccompData{
+		Nr:                 sysno,
+		Arch:               t.tc.st.AuditNumber,
+		InstructionPointer: uint64(ip),
 	}
 	// data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so
 	// we can't do any slicing tricks or even use copy/append here.
 	for i, arg := range args {
-		if i >= len(data.args) {
+		if i >= len(data.Args) {
 			break
 		}
-		data.args[i] = arg.Uint64()
+		data.Args[i] = arg.Uint64()
 	}
-	input := data.asBPFInput()
+	input := dataAsBPFInput(t, &data)
 
 	ret := uint32(linux.SECCOMP_RET_ALLOW)
 	f := t.syscallFilters.Load()
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index c00fa1138..c39ecfb8f 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -283,6 +283,33 @@ func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.File
 	return nil
 }
 
+// GetStat extracts semid_ds information from the set.
+func (s *Set) GetStat(creds *auth.Credentials) (*linux.SemidDS, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// "The calling process must have read permission on the semaphore set."
+	if !s.checkPerms(creds, fs.PermMask{Read: true}) {
+		return nil, syserror.EACCES
+	}
+
+	ds := &linux.SemidDS{
+		SemPerm: linux.IPCPerm{
+			Key:  uint32(s.key),
+			UID:  uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)),
+			GID:  uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)),
+			CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)),
+			CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)),
+			Mode: uint16(s.perms.LinuxMode()),
+			Seq:  0, // IPC sequence not supported.
+		},
+		SemOTime: s.opTime.TimeT(),
+		SemCTime: s.changeTime.TimeT(),
+		SemNSems: uint64(s.Size()),
+	}
+	return ds, nil
+}
+
 // SetVal overrides a semaphore value, waking up waiters as needed.
 func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error {
 	if val < 0 || val > valueMax {
@@ -320,7 +347,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti
 	}
 
 	for _, val := range vals {
-		if val < 0 || val > valueMax {
+		if val > valueMax {
 			return syserror.ERANGE
 		}
 	}
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 5c4c622c2..df5c8421b 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -16,8 +16,6 @@ package kernel
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -32,7 +30,7 @@ type ProcessGroupID ThreadID
 //
 // +stateify savable
 type Session struct {
-	refs refs.AtomicRefCount
+	SessionRefs
 
 	// leader is the originator of the Session.
 	//
@@ -62,16 +60,11 @@ type Session struct {
 	sessionEntry
 }
 
-// incRef grabs a reference.
-func (s *Session) incRef() {
-	s.refs.IncRef()
-}
-
-// decRef drops a reference.
+// DecRef drops a reference.
 //
 // Precondition: callers must hold TaskSet.mu for writing.
-func (s *Session) decRef() {
-	s.refs.DecRefWithDestructor(nil, func(context.Context) {
+func (s *Session) DecRef() {
+	s.SessionRefs.DecRef(func() {
 		// Remove translations from the leader.
 		for ns := s.leader.pidns; ns != nil; ns = ns.parent {
 			id := ns.sids[s]
@@ -88,7 +81,7 @@ func (s *Session) decRef() {
 //
 // +stateify savable
 type ProcessGroup struct {
-	refs refs.AtomicRefCount // not exported.
+	refs ProcessGroupRefs
 
 	// originator is the originator of the group.
 	//
@@ -163,7 +156,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
 	}
 
 	alive := true
-	pg.refs.DecRefWithDestructor(nil, func(context.Context) {
+	pg.refs.DecRef(func() {
 		alive = false // don't bother with handleOrphan.
 
 		// Remove translations from the originator.
@@ -175,7 +168,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
 
 		// Remove the list of process groups.
 		pg.session.processGroups.Remove(pg)
-		pg.session.decRef()
+		pg.session.DecRef()
 	})
 	if alive {
 		pg.handleOrphan()
@@ -302,7 +295,7 @@ func (tg *ThreadGroup) createSession() error {
 		id:     SessionID(id),
 		leader: tg,
 	}
-	s.refs.EnableLeakCheck("kernel.Session")
+	s.EnableLeakCheck()
 
 	// Create a new ProcessGroup, belonging to that Session.
 	// This also has a single reference (assigned below).
@@ -316,7 +309,7 @@ func (tg *ThreadGroup) createSession() error {
 		session:    s,
 		ancestors:  0,
 	}
-	pg.refs.EnableLeakCheck("kernel.ProcessGroup")
+	pg.refs.EnableLeakCheck()
 
 	// Tie them and return the result.
 	s.processGroups.PushBack(pg)
@@ -396,13 +389,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
 	//
 	// We manually adjust the ancestors if the parent is in the same
 	// session.
-	tg.processGroup.session.incRef()
+	tg.processGroup.session.IncRef()
 	pg := ProcessGroup{
 		id:         ProcessGroupID(id),
 		originator: tg,
 		session:    tg.processGroup.session,
 	}
-	pg.refs.EnableLeakCheck("kernel.ProcessGroup")
+	pg.refs.EnableLeakCheck()
 
 	if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session {
 		pg.ancestors++
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index c211fc8d0..80a592c8f 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -1,12 +1,25 @@
 load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
+go_template_instance(
+    name = "shm_refs",
+    out = "shm_refs.go",
+    package = "shm",
+    prefix = "Shm",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "Shm",
+    },
+)
+
 go_library(
     name = "shm",
     srcs = [
         "device.go",
         "shm.go",
+        "shm_refs.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
@@ -14,6 +27,7 @@ go_library(
         "//pkg/context",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 13ec7afe0..ebbebf46b 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -39,7 +39,6 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -252,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi
 		creatorPID:    pid,
 		changeTime:    ktime.NowFromContext(ctx),
 	}
-	shm.EnableLeakCheck("kernel.Shm")
+	shm.EnableLeakCheck()
 
 	// Find the next available ID.
 	for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
@@ -322,9 +321,32 @@ func (r *Registry) remove(s *Shm) {
 	r.totalPages -= s.effectiveSize / usermem.PageSize
 }
 
+// Release drops the self-reference of each active shm segment in the registry.
+// It is called when the kernel.IPCNamespace containing r is being destroyed.
+func (r *Registry) Release(ctx context.Context) {
+	// Because Shm.DecRef() may acquire the same locks, collect the segments to
+	// release first. Note that this should not race with any updates to r, since
+	// the IPC namespace containing it has no more references.
+	toRelease := make([]*Shm, 0)
+	r.mu.Lock()
+	for _, s := range r.keysToShms {
+		s.mu.Lock()
+		if !s.pendingDestruction {
+			toRelease = append(toRelease, s)
+		}
+		s.mu.Unlock()
+	}
+	r.mu.Unlock()
+
+	for _, s := range toRelease {
+		r.dissociateKey(s)
+		s.DecRef(ctx)
+	}
+}
+
 // Shm represents a single shared memory segment.
 //
-// Shm segment are backed directly by an allocation from platform memory.
+// Shm segments are backed directly by an allocation from platform memory.
 // Segments are always mapped as a whole, greatly simplifying how mappings are
 // tracked. However note that mremap and munmap calls may cause the vma for a
 // segment to become fragmented; which requires special care when unmapping a
@@ -337,14 +359,14 @@ func (r *Registry) remove(s *Shm) {
 //
 // +stateify savable
 type Shm struct {
-	// AtomicRefCount tracks the number of references to this segment.
+	// ShmRefs tracks the number of references to this segment.
 	//
 	// A segment holds a reference to itself until it is marked for
 	// destruction.
 	//
 	// In addition to direct users, the MemoryManager will hold references
 	// via MappingIdentity.
-	refs.AtomicRefCount
+	ShmRefs
 
 	mfp pgalloc.MemoryFileProvider
 
@@ -428,11 +450,14 @@ func (s *Shm) InodeID() uint64 {
 	return uint64(s.ID)
 }
 
-// DecRef overrides refs.RefCount.DecRef with a destructor.
+// DecRef drops a reference on s.
 //
 // Precondition: Caller must not hold s.mu.
 func (s *Shm) DecRef(ctx context.Context) {
-	s.DecRefWithDestructor(ctx, s.destroy)
+	s.ShmRefs.DecRef(func() {
+		s.mfp.MemoryFile().DecRef(s.fr)
+		s.registry.remove(s)
+	})
 }
 
 // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm
@@ -642,11 +667,6 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error {
 	return nil
 }
 
-func (s *Shm) destroy(context.Context) {
-	s.mfp.MemoryFile().DecRef(s.fr)
-	s.registry.remove(s)
-}
-
 // MarkDestroyed marks a segment for destruction. The segment is actually
 // destroyed once it has no references. MarkDestroyed may be called multiple
 // times, and is safe to call after a segment has already been destroyed. See
@@ -655,17 +675,20 @@ func (s *Shm) MarkDestroyed(ctx context.Context) {
 	s.registry.dissociateKey(s)
 
 	s.mu.Lock()
-	defer s.mu.Unlock()
-	if !s.pendingDestruction {
-		s.pendingDestruction = true
-		// Drop the self-reference so destruction occurs when all
-		// external references are gone.
-		//
-		// N.B. This cannot be the final DecRef, as the caller also
-		// holds a reference.
-		s.DecRef(ctx)
+	if s.pendingDestruction {
+		s.mu.Unlock()
 		return
 	}
+	s.pendingDestruction = true
+	s.mu.Unlock()
+
+	// Drop the self-reference so destruction occurs when all
+	// external references are gone.
+	//
+	// N.B. This cannot be the final DecRef, as the caller also
+	// holds a reference.
+	s.DecRef(ctx)
+	return
 }
 
 // checkOwnership verifies whether a segment may be accessed by ctx as an
diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD
index 3eb78e91b..76d472292 100644
--- a/pkg/sentry/kernel/signalfd/BUILD
+++ b/pkg/sentry/kernel/signalfd/BUILD
@@ -8,7 +8,6 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/anon",
diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go
index b07e1c1bd..78f718cfe 100644
--- a/pkg/sentry/kernel/signalfd/signalfd.go
+++ b/pkg/sentry/kernel/signalfd/signalfd.go
@@ -17,7 +17,6 @@ package signalfd
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
@@ -103,8 +102,7 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
 	}
 
 	// Copy out the signal info using the specified format.
-	var buf [128]byte
-	binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{
+	infoNative := linux.SignalfdSiginfo{
 		Signo:   uint32(info.Signo),
 		Errno:   info.Errno,
 		Code:    info.Code,
@@ -113,9 +111,13 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
 		Status:  info.Status(),
 		Overrun: uint32(info.Overrun()),
 		Addr:    info.Addr(),
-	})
-	n, err := dst.CopyOut(ctx, buf[:])
-	return int64(n), err
+	}
+	n, err := infoNative.WriteTo(dst.Writer(ctx))
+	if err == usermem.ErrEndOfIOSequence {
+		// Partial copy-out ok.
+		err = nil
+	}
+	return n, err
 }
 
 // Readiness implements waiter.Waitable.Readiness.
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 413111faf..332bdb8e8 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -348,6 +348,16 @@ func (s *SyscallTable) LookupName(sysno uintptr) string {
 	return fmt.Sprintf("sys_%d", sysno) // Unlikely.
 }
 
+// LookupNo looks up a syscall number by name.
+func (s *SyscallTable) LookupNo(name string) (uintptr, error) {
+	for i, syscall := range s.Table {
+		if syscall.Name == name {
+			return uintptr(i), nil
+		}
+	}
+	return 0, fmt.Errorf("syscall %q not found", name)
+}
+
 // LookupEmulate looks up an emulation syscall number.
 func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
 	sysno, ok := s.Emulate[addr]
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 5aee699e7..037971393 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -574,6 +574,11 @@ type Task struct {
 	//
 	// startTime is protected by mu.
 	startTime ktime.Time
+
+	// kcov is the kcov instance providing code coverage owned by this task.
+	//
+	// kcov is exclusive to the task goroutine.
+	kcov *Kcov
 }
 
 func (t *Task) savePtraceTracer() *Task {
@@ -651,7 +656,9 @@ func (t *Task) Value(key interface{}) interface{} {
 	case CtxUTSNamespace:
 		return t.utsns
 	case CtxIPCNamespace:
-		return t.ipcns
+		ipcns := t.IPCNamespace()
+		ipcns.IncRef()
+		return ipcns
 	case CtxTask:
 		return t
 	case auth.CtxCredentials:
@@ -730,7 +737,6 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
 func (t *Task) IsChrooted() bool {
 	if VFS2Enabled {
 		realRoot := t.mountNamespaceVFS2.Root()
-		defer realRoot.DecRef(t)
 		root := t.fsContext.RootDirectoryVFS2()
 		defer root.DecRef(t)
 		return root != realRoot
@@ -863,7 +869,6 @@ func (t *Task) MountNamespace() *fs.MountNamespace {
 func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	t.mountNamespaceVFS2.IncRef()
 	return t.mountNamespaceVFS2
 }
 
@@ -903,3 +908,16 @@ func (t *Task) UID() uint32 {
 func (t *Task) GID() uint32 {
 	return uint32(t.Credentials().EffectiveKGID)
 }
+
+// SetKcov sets the kcov instance associated with t.
+func (t *Task) SetKcov(k *Kcov) {
+	t.kcov = k
+}
+
+// ResetKcov clears the kcov instance associated with t.
+func (t *Task) ResetKcov() {
+	if t.kcov != nil {
+		t.kcov.OnTaskExit()
+		t.kcov = nil
+	}
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 9d7a9128f..682080c14 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -19,6 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bpf"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -203,7 +204,13 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
 		// namespace"
 		ipcns = NewIPCNamespace(userns)
+	} else {
+		ipcns.IncRef()
 	}
+	cu := cleanup.Make(func() {
+		ipcns.DecRef(t)
+	})
+	defer cu.Clean()
 
 	netns := t.NetworkNamespace()
 	if opts.NewNetworkNamespace {
@@ -214,12 +221,18 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	mntnsVFS2 := t.mountNamespaceVFS2
 	if mntnsVFS2 != nil {
 		mntnsVFS2.IncRef()
+		cu.Add(func() {
+			mntnsVFS2.DecRef(t)
+		})
 	}
 
 	tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace)
 	if err != nil {
 		return 0, nil, err
 	}
+	cu.Add(func() {
+		tc.release()
+	})
 	// clone() returns 0 in the child.
 	tc.Arch.SetReturn(0)
 	if opts.Stack != 0 {
@@ -295,11 +308,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	} else {
 		cfg.InheritParent = t
 	}
-	nt, err := t.tg.pidns.owner.NewTask(cfg)
+	nt, err := t.tg.pidns.owner.NewTask(t, cfg)
+	// If NewTask succeeds, we transfer references to nt. If NewTask fails, it does
+	// the cleanup for us.
+	cu.Release()
 	if err != nil {
-		if opts.NewThreadGroup {
-			tg.release(t)
-		}
 		return 0, nil, err
 	}
 
@@ -341,12 +354,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		nt.SetClearTID(opts.ChildTID)
 	}
 	if opts.ChildSetTID {
-		// Can't use Task.CopyOut, which assumes AddressSpaceActive.
-		usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{})
+		ctid := nt.ThreadID()
+		ctid.CopyOut(nt.AsCopyContext(usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID)
 	}
 	ntid := t.tg.pidns.IDOfTask(nt)
 	if opts.ParentSetTID {
-		t.CopyOut(opts.ParentTID, ntid)
+		ntid.CopyOut(t, opts.ParentTID)
 	}
 
 	kind := ptraceCloneKindClone
@@ -509,6 +522,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 		}
 		// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
 		// namespace"
+		t.ipcns.DecRef(t)
 		t.ipcns = NewIPCNamespace(creds.UserNamespace)
 	}
 	var oldFDTable *FDTable
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 9fa528384..d1136461a 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -126,7 +126,11 @@ func (t *Task) SyscallTable() *SyscallTable {
 // Preconditions: The caller must be running on the task goroutine, or t.mu
 // must be locked.
 func (t *Task) Stack() *arch.Stack {
-	return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())}
+	return &arch.Stack{
+		Arch:   t.Arch(),
+		IO:     t.MemoryManager(),
+		Bottom: usermem.Addr(t.Arch().Stack()),
+	}
 }
 
 // LoadTaskImage loads a specified file into a new TaskContext.
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 5e4fb3e3a..412d471d3 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -237,9 +237,10 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 // promoteLocked makes t the leader of its thread group. If t is already the
 // thread group leader, promoteLocked is a no-op.
 //
-// Preconditions: All other tasks in t's thread group, including the existing
-// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must
-// be locked for writing.
+// Preconditions:
+// * All other tasks in t's thread group, including the existing leader (if it
+//   is not t), have reached TaskExitZombie.
+// * The TaskSet mutex must be locked for writing.
 func (t *Task) promoteLocked() {
 	oldLeader := t.tg.leader
 	if t == oldLeader {
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index c165d6cb1..ce7b9641d 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -239,6 +239,8 @@ func (*runExitMain) execute(t *Task) taskRunState {
 	t.traceExitEvent()
 	lastExiter := t.exitThreadGroup()
 
+	t.ResetKcov()
+
 	// If the task has a cleartid, and the thread group wasn't killed by a
 	// signal, handle that before releasing the MM.
 	if t.cleartid != 0 {
@@ -246,7 +248,8 @@ func (*runExitMain) execute(t *Task) taskRunState {
 		signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
 		t.tg.signalHandlers.mu.Unlock()
 		if !signaled {
-			if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil {
+			zero := ThreadID(0)
+			if _, err := zero.CopyOut(t, t.cleartid); err == nil {
 				t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1)
 			}
 			// If the CopyOut fails, there's nothing we can do.
@@ -277,12 +280,13 @@ func (*runExitMain) execute(t *Task) taskRunState {
 		t.mountNamespaceVFS2.DecRef(t)
 		t.mountNamespaceVFS2 = nil
 	}
+	t.ipcns.DecRef(t)
 	t.mu.Unlock()
 
 	// If this is the last task to exit from the thread group, release the
 	// thread group's resources.
 	if lastExiter {
-		t.tg.release(t)
+		t.tg.Release(t)
 	}
 
 	// Detach tracees.
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
index 4b535c949..c80391475 100644
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@@ -16,6 +16,7 @@ package kernel
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -87,7 +88,7 @@ func (t *Task) exitRobustList() {
 		return
 	}
 
-	next := rl.List
+	next := primitive.Uint64(rl.List)
 	done := 0
 	var pendingLockAddr usermem.Addr
 	if rl.ListOpPending != 0 {
@@ -99,12 +100,12 @@ func (t *Task) exitRobustList() {
 		// We traverse to the next element of the list before we
 		// actually wake anything. This prevents the race where waking
 		// this futex causes a modification of the list.
-		thisLockAddr := usermem.Addr(next + rl.FutexOffset)
+		thisLockAddr := usermem.Addr(uint64(next) + rl.FutexOffset)
 
 		// Try to decode the next element in the list before waking the
 		// current futex. But don't check the error until after we've
 		// woken the current futex. Linux does it in this order too
-		_, nextErr := t.CopyIn(usermem.Addr(next), &next)
+		_, nextErr := next.CopyIn(t, usermem.Addr(next))
 
 		// Wakeup the current futex if it's not pending.
 		if thisLockAddr != pendingLockAddr {
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index aa3a573c0..8dc3fec90 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -141,7 +141,7 @@ func (*runApp) handleCPUIDInstruction(t *Task) error {
 	region := trace.StartRegion(t.traceContext, cpuidRegion)
 	expected := arch.CPUIDInstruction[:]
 	found := make([]byte, len(expected))
-	_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+	_, err := t.CopyInBytes(usermem.Addr(t.Arch().IP()), found)
 	if err == nil && bytes.Equal(expected, found) {
 		// Skip the cpuid instruction.
 		t.Arch().CPUIDEmulate(t)
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 09366b60c..52c55d13d 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -133,9 +133,10 @@ func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
 	}
 }
 
-// Preconditions: The caller must be running on the task goroutine, and leaving
-// a state indicated by a previous call to
-// t.accountTaskGoroutineEnter(state).
+// Preconditions:
+// * The caller must be running on the task goroutine
+// * The caller must be leaving a state indicated by a previous call to
+//   t.accountTaskGoroutineEnter(state).
 func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
 	if state != TaskGoroutineRunningApp {
 		// Task is unblocking/continuing.
@@ -191,8 +192,8 @@ func (tg *ThreadGroup) CPUStats() usage.CPUStats {
 	return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow())
 }
 
-// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex
-// must be locked.
+// Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus:
+// * The TaskSet mutex must be locked.
 func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats {
 	stats := tg.exitedCPUStats
 	// Account for live tasks.
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index d6a2040bc..ebdb83061 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -259,7 +259,11 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
 	// Set up the signal handler. If we have a saved signal mask, the signal
 	// handler should run with the current mask, but sigreturn should restore
 	// the saved one.
-	st := &arch.Stack{t.Arch(), mm, sp}
+	st := &arch.Stack{
+		Arch:   t.Arch(),
+		IO:     mm,
+		Bottom: sp,
+	}
 	mask := t.signalMask
 	if t.haveSavedSignalMask {
 		mask = t.savedSignalMask
@@ -319,8 +323,9 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) {
 
 // Sigtimedwait implements the semantics of sigtimedwait(2).
 //
-// Preconditions: The caller must be running on the task goroutine. t.exitState
-// < TaskExitZombie.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t.exitState < TaskExitZombie.
 func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) {
 	// set is the set of signals we're interested in; invert it to get the set
 	// of signals to block.
@@ -584,8 +589,9 @@ func (t *Task) SignalMask() linux.SignalSet {
 
 // SetSignalMask sets t's signal mask.
 //
-// Preconditions: SetSignalMask can only be called by the task goroutine.
-// t.exitState < TaskExitZombie.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * t.exitState < TaskExitZombie.
 func (t *Task) SetSignalMask(mask linux.SignalSet) {
 	// By precondition, t prevents t.tg from completing an execve and mutating
 	// t.tg.signalHandlers, so we can skip the TaskSet mutex.
@@ -631,7 +637,7 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) {
 // SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's
 // comment).
 //
-// Preconditions: SetSavedSignalMask can only be called by the task goroutine.
+// Preconditions: The caller must be running on the task goroutine.
 func (t *Task) SetSavedSignalMask(mask linux.SignalSet) {
 	t.savedSignalMask = mask
 	t.haveSavedSignalMask = true
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 64c1e120a..8e28230cc 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -16,6 +16,7 @@ package kernel
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -98,14 +99,18 @@ type TaskConfig struct {
 // NewTask creates a new task defined by cfg.
 //
 // NewTask does not start the returned task; the caller must call Task.Start.
-func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
+//
+// If successful, NewTask transfers references held by cfg to the new task.
+// Otherwise, NewTask releases them.
+func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
 	t, err := ts.newTask(cfg)
 	if err != nil {
 		cfg.TaskContext.release()
-		cfg.FSContext.DecRef(t)
-		cfg.FDTable.DecRef(t)
+		cfg.FSContext.DecRef(ctx)
+		cfg.FDTable.DecRef(ctx)
+		cfg.IPCNamespace.DecRef(ctx)
 		if cfg.MountNamespaceVFS2 != nil {
-			cfg.MountNamespaceVFS2.DecRef(t)
+			cfg.MountNamespaceVFS2.DecRef(ctx)
 		}
 		return nil, err
 	}
diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go
index 296735d32..a35948a5f 100644
--- a/pkg/sentry/kernel/task_stop.go
+++ b/pkg/sentry/kernel/task_stop.go
@@ -99,8 +99,9 @@ type TaskStop interface {
 
 // beginInternalStop indicates the start of an internal stop that applies to t.
 //
-// Preconditions: The task must not already be in an internal stop (i.e. t.stop
-// == nil). The caller must be running on the task goroutine.
+// Preconditions:
+// * The caller must be running on the task goroutine.
+// * The task must not already be in an internal stop (i.e. t.stop == nil).
 func (t *Task) beginInternalStop(s TaskStop) {
 	t.tg.pidns.owner.mu.RLock()
 	defer t.tg.pidns.owner.mu.RUnlock()
@@ -109,8 +110,8 @@ func (t *Task) beginInternalStop(s TaskStop) {
 	t.beginInternalStopLocked(s)
 }
 
-// Preconditions: The signal mutex must be locked. All preconditions for
-// Task.beginInternalStop also apply.
+// Preconditions: Same as beginInternalStop, plus:
+// * The signal mutex must be locked.
 func (t *Task) beginInternalStopLocked(s TaskStop) {
 	if t.stop != nil {
 		panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop))
@@ -128,8 +129,9 @@ func (t *Task) beginInternalStopLocked(s TaskStop) {
 // t.stop, which is why there is no endInternalStop that locks the signal mutex
 // for you.
 //
-// Preconditions: The signal mutex must be locked. The task must be in an
-// internal stop (i.e. t.stop != nil).
+// Preconditions:
+// * The signal mutex must be locked.
+// * The task must be in an internal stop (i.e. t.stop != nil).
 func (t *Task) endInternalStopLocked() {
 	if t.stop == nil {
 		panic("Attempting to leave non-existent internal stop")
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index 2dbf86547..0141459e7 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -22,6 +22,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bits"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
@@ -287,7 +288,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
 
 	// Grab the caller up front, to make sure there's a sensible stack.
 	caller := t.Arch().Native(uintptr(0))
-	if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil {
+	if _, err := caller.CopyIn(t, usermem.Addr(t.Arch().Stack())); err != nil {
 		t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
 		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
 		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
@@ -323,7 +324,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
 type runVsyscallAfterPtraceEventSeccomp struct {
 	addr   usermem.Addr
 	sysno  uintptr
-	caller interface{}
+	caller marshal.Marshallable
 }
 
 func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
@@ -346,7 +347,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
 	return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
 }
 
-func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState {
+func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState {
 	rval, ctrl, err := t.executeSyscall(sysno, args)
 	if ctrl != nil {
 		t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index b02044ad2..ce134bf54 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -18,6 +18,7 @@ import (
 	"math"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -43,17 +44,6 @@ func (t *Task) Deactivate() {
 	}
 }
 
-// CopyIn copies a fixed-size value or slice of fixed-size values in from the
-// task's memory. The copy will fail with syscall.EFAULT if it traverses user
-// memory that is unmapped or not readable by the user.
-//
-// This Task's AddressSpace must be active.
-func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) {
-	return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
-}
-
 // CopyInBytes is a fast version of CopyIn if the caller can serialize the
 // data without reflection and pass in a byte slice.
 //
@@ -64,17 +54,6 @@ func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
 	})
 }
 
-// CopyOut copies a fixed-size value or slice of fixed-size values out to the
-// task's memory. The copy will fail with syscall.EFAULT if it traverses user
-// memory that is unmapped or not writeable by the user.
-//
-// This Task's AddressSpace must be active.
-func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) {
-	return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
-}
-
 // CopyOutBytes is a fast version of CopyOut if the caller can serialize the
 // data without reflection and pass in a byte slice.
 //
@@ -114,7 +93,7 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
 	var v []string
 	for {
 		argAddr := t.Arch().Native(0)
-		if _, err := t.CopyIn(addr, argAddr); err != nil {
+		if _, err := argAddr.CopyIn(t, addr); err != nil {
 			return v, err
 		}
 		if t.Arch().Value(argAddr) == 0 {
@@ -143,8 +122,9 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([
 // CopyOutIovecs converts src to an array of struct iovecs and copies it to the
 // memory mapped at addr.
 //
-// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyOut, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error {
 	switch t.Arch().Width() {
 	case 8:
@@ -191,8 +171,9 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error
 // combined length of all AddrRanges would otherwise exceed this amount, ranges
 // beyond MAX_RW_COUNT are silently truncated.
 //
-// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the
-// task goroutine. t's AddressSpace must be active.
+// Preconditions: Same as usermem.IO.CopyIn, plus:
+// * The caller must be running on the task goroutine.
+// * t's AddressSpace must be active.
 func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) {
 	if numIovecs == 0 {
 		return usermem.AddrRangeSeq{}, nil
@@ -284,7 +265,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp
 //
 // IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec().
 //
-// Preconditions: As for Task.CopyInIovecs.
+// Preconditions: Same as Task.CopyInIovecs.
 func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) {
 	if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV {
 		return usermem.IOSequence{}, syserror.EINVAL
@@ -299,3 +280,30 @@ func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOp
 		Opts:  opts,
 	}, nil
 }
+
+// copyContext implements marshal.CopyContext. It wraps a task to allow copying
+// memory to and from the task memory with custom usermem.IOOpts.
+type copyContext struct {
+	*Task
+	opts usermem.IOOpts
+}
+
+// AsCopyContext wraps the task and returns it as CopyContext.
+func (t *Task) AsCopyContext(opts usermem.IOOpts) marshal.CopyContext {
+	return &copyContext{t, opts}
+}
+
+// CopyInString copies a string in from the task's memory.
+func (t *copyContext) CopyInString(addr usermem.Addr, maxLen int) (string, error) {
+	return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxLen, t.opts)
+}
+
+// CopyInBytes copies task memory into dst from an IO context.
+func (t *copyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+	return t.MemoryManager().CopyIn(t, addr, dst, t.opts)
+}
+
+// CopyOutBytes copies src into task memoryfrom an IO context.
+func (t *copyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+	return t.MemoryManager().CopyOut(t, addr, src, t.opts)
+}
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 0b34c0099..a183b28c1 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -18,6 +18,7 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -307,8 +308,8 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet {
 	return tg.limits
 }
 
-// release releases the thread group's resources.
-func (tg *ThreadGroup) release(t *Task) {
+// Release releases the thread group's resources.
+func (tg *ThreadGroup) Release(ctx context.Context) {
 	// Timers must be destroyed without holding the TaskSet or signal mutexes
 	// since timers send signals with Timer.mu locked.
 	tg.itimerRealTimer.Destroy()
@@ -325,7 +326,7 @@ func (tg *ThreadGroup) release(t *Task) {
 		it.DestroyTimer()
 	}
 	if tg.mounts != nil {
-		tg.mounts.DecRef(t)
+		tg.mounts.DecRef(ctx)
 	}
 }
 
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index 872e1a82d..fdadb52c0 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -36,6 +36,8 @@ import (
 const TasksLimit = (1 << 16)
 
 // ThreadID is a generic thread identifier.
+//
+// +marshal
 type ThreadID int32
 
 // String returns a decimal representation of the ThreadID.
@@ -263,6 +265,13 @@ func (ns *PIDNamespace) Tasks() []*Task {
 	return tasks
 }
 
+// NumTasks returns the number of tasks in ns.
+func (ns *PIDNamespace) NumTasks() int {
+	ns.owner.mu.RLock()
+	defer ns.owner.mu.RUnlock()
+	return len(ns.tids)
+}
+
 // ThreadGroups returns a snapshot of the thread groups in ns.
 func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup {
 	return ns.ThreadGroupsAppend(nil)
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index e959700f2..f61a8e164 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -616,8 +616,10 @@ func (t *Timer) Swap(s Setting) (Time, Setting) {
 // Timer's Clock) at which the Setting was changed. Setting s.Enabled to true
 // starts the timer, while setting s.Enabled to false stops it.
 //
-// Preconditions: The Timer must not be paused. f cannot call any Timer methods
-// since it is called with the Timer mutex locked.
+// Preconditions:
+// * The Timer must not be paused.
+// * f cannot call any Timer methods since it is called with the Timer mutex
+//   locked.
 func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) {
 	now := t.clock.Now()
 	t.mu.Lock()
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
index 290c32466..9bc452e67 100644
--- a/pkg/sentry/kernel/vdso.go
+++ b/pkg/sentry/kernel/vdso.go
@@ -17,7 +17,6 @@ package kernel
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -28,6 +27,8 @@ import (
 //
 // They are exposed to the VDSO via a parameter page managed by VDSOParamPage,
 // which also includes a sequence counter.
+//
+// +marshal
 type vdsoParams struct {
 	monotonicReady      uint64
 	monotonicBaseCycles int64
@@ -68,21 +69,29 @@ type VDSOParamPage struct {
 	// checked in state_test_util tests, causing this field to change across
 	// save / restore.
 	seq uint64
+
+	// copyScratchBuffer is a temporary buffer used to marshal the params before
+	// copying it to the real parameter page. The parameter page is typically
+	// updated at a moderate frequency of ~O(seconds) throughout the lifetime of
+	// the sentry, so reusing this buffer is a good tradeoff between memory
+	// usage and the cost of allocation.
+	copyScratchBuffer []byte
 }
 
 // NewVDSOParamPage returns a VDSOParamPage.
 //
 // Preconditions:
-//
 // * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does
 //   not take ownership of fr; it must remain allocated for the lifetime of the
 //   VDSOParamPage.
-//
 // * VDSOParamPage must be the only writer to fr.
-//
 // * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block.
 func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage {
-	return &VDSOParamPage{mfp: mfp, fr: fr}
+	return &VDSOParamPage{
+		mfp:               mfp,
+		fr:                fr,
+		copyScratchBuffer: make([]byte, (*vdsoParams)(nil).SizeBytes()),
+	}
 }
 
 // access returns a mapping of the param page.
@@ -136,7 +145,8 @@ func (v *VDSOParamPage) Write(f func() vdsoParams) error {
 
 	// Get the new params.
 	p := f()
-	buf := binary.Marshal(nil, usermem.ByteOrder, p)
+	buf := v.copyScratchBuffer[:p.SizeBytes()]
+	p.MarshalUnsafe(buf)
 
 	// Skip the sequence counter.
 	if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil {
diff --git a/pkg/sentry/limits/context.go b/pkg/sentry/limits/context.go
index 77e1fe217..0bade6e57 100644
--- a/pkg/sentry/limits/context.go
+++ b/pkg/sentry/limits/context.go
@@ -33,3 +33,12 @@ func FromContext(ctx context.Context) *LimitSet {
 	}
 	return nil
 }
+
+// FromContextOrDie returns FromContext(ctx) if the latter is not nil.
+// Otherwise, panic is triggered.
+func FromContextOrDie(ctx context.Context) *LimitSet {
+	if v := ctx.Value(CtxLimits); v != nil {
+		return v.(*LimitSet)
+	}
+	panic("failed to create limit set from context")
+}
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 20dd1cc21..98af2cc38 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -194,6 +194,10 @@ func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) {
 		log.Infof("Too many phdrs (%d): total size %d > %d", hdr.Phnum, totalPhdrSize, maxTotalPhdrSize)
 		return elfInfo{}, syserror.ENOEXEC
 	}
+	if int64(hdr.Phoff) < 0 || int64(hdr.Phoff+uint64(totalPhdrSize)) < 0 {
+		ctx.Infof("Unsupported phdr offset %d", hdr.Phoff)
+		return elfInfo{}, syserror.ENOEXEC
+	}
 
 	phdrBuf := make([]byte, totalPhdrSize)
 	_, err = f.ReadFull(ctx, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff))
@@ -402,8 +406,7 @@ type loadedELF struct {
 //
 // It does not load the ELF interpreter, or return any auxv entries.
 //
-// Preconditions:
-//  * f is an ELF file
+// Preconditions: f is an ELF file.
 func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) {
 	first := true
 	var start, end usermem.Addr
@@ -438,6 +441,10 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in
 				ctx.Infof("PT_INTERP path too big: %v", phdr.Filesz)
 				return loadedELF{}, syserror.ENOEXEC
 			}
+			if int64(phdr.Off) < 0 || int64(phdr.Off+phdr.Filesz) < 0 {
+				ctx.Infof("Unsupported PT_INTERP offset %d", phdr.Off)
+				return loadedELF{}, syserror.ENOEXEC
+			}
 
 			path := make([]byte, phdr.Filesz)
 			_, err := f.ReadFull(ctx, usermem.BytesIOSequence(path), int64(phdr.Off))
@@ -571,8 +578,8 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in
 // It does not load the ELF interpreter, or return any auxv entries.
 //
 // Preconditions:
-//  * f is an ELF file
-//  * f is the first ELF loaded into m
+// * f is an ELF file.
+// * f is the first ELF loaded into m.
 func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f fsbridge.File) (loadedELF, arch.Context, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
@@ -609,8 +616,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS
 //
 // It does not return any auxv entries.
 //
-// Preconditions:
-//  * f is an ELF file
+// Preconditions: f is an ELF file.
 func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
@@ -640,8 +646,7 @@ func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.Fil
 // If loadELF returns ErrSwitchFile it should be called again with the returned
 // path and argv.
 //
-// Preconditions:
-//  * args.File is an ELF file
+// Preconditions: args.File is an ELF file.
 func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error) {
 	bin, ac, err := loadInitialELF(ctx, args.MemoryManager, args.Features, args.File)
 	if err != nil {
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 8d6802ea3..c69b62db9 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -122,7 +122,7 @@ func allocStack(ctx context.Context, m *mm.MemoryManager, a arch.Context) (*arch
 	if err != nil {
 		return nil, err
 	}
-	return &arch.Stack{a, m, ar.End}, nil
+	return &arch.Stack{Arch: a, IO: m, Bottom: ar.End}, nil
 }
 
 const (
@@ -215,8 +215,8 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 // path and argv.
 //
 // Preconditions:
-//  * The Task MemoryManager is empty.
-//  * Load is called on the Task goroutine.
+// * The Task MemoryManager is empty.
+// * Load is called on the Task goroutine.
 func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
 	// Load the executable itself.
 	loaded, ac, file, newArgv, err := loadExecutable(ctx, args)
@@ -247,20 +247,20 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V
 	}
 
 	// Push the original filename to the stack, for AT_EXECFN.
-	execfn, err := stack.Push(args.Filename)
-	if err != nil {
+	if _, err := stack.PushNullTerminatedByteSlice([]byte(args.Filename)); err != nil {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push exec filename: %v", err), syserr.FromError(err).ToLinux())
 	}
+	execfn := stack.Bottom
 
 	// Push 16 random bytes on the stack which AT_RANDOM will point to.
 	var b [16]byte
 	if _, err := rand.Read(b[:]); err != nil {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to read random bytes: %v", err), syserr.FromError(err).ToLinux())
 	}
-	random, err := stack.Push(b)
-	if err != nil {
+	if _, err = stack.PushNullTerminatedByteSlice(b[:]); err != nil {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push random bytes: %v", err), syserr.FromError(err).ToLinux())
 	}
+	random := stack.Bottom
 
 	c := auth.CredentialsFromContext(ctx)
 
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index 05a294fe6..241d87835 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -380,3 +380,9 @@ func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF)
 
 	return vdsoAddr, nil
 }
+
+// Release drops references on mappings held by v.
+func (v *VDSO) Release(ctx context.Context) {
+	v.ParamPage.DecRef(ctx)
+	v.vdso.DecRef(ctx)
+}
diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go
index d609c1ae0..457ed87f8 100644
--- a/pkg/sentry/memmap/mapping_set.go
+++ b/pkg/sentry/memmap/mapping_set.go
@@ -177,7 +177,7 @@ func subsetMapping(wholeRange, subsetRange MappableRange, ms MappingSpace, addr
 // AddMapping adds the given mapping and returns the set of MappableRanges that
 // previously had no mappings.
 //
-// Preconditions: As for Mappable.AddMapping.
+// Preconditions: Same as Mappable.AddMapping.
 func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) []MappableRange {
 	mr := MappableRange{offset, offset + uint64(ar.Length())}
 	var mapped []MappableRange
@@ -204,7 +204,7 @@ func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset ui
 // RemoveMapping removes the given mapping and returns the set of
 // MappableRanges that now have no mappings.
 //
-// Preconditions: As for Mappable.RemoveMapping.
+// Preconditions: Same as Mappable.RemoveMapping.
 func (s *MappingSet) RemoveMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) []MappableRange {
 	mr := MappableRange{offset, offset + uint64(ar.Length())}
 	var unmapped []MappableRange
diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go
index 65d83096f..7fd77925f 100644
--- a/pkg/sentry/memmap/memmap.go
+++ b/pkg/sentry/memmap/memmap.go
@@ -28,9 +28,9 @@ import (
 //
 // See mm/mm.go for Mappable's place in the lock order.
 //
-// Preconditions: For all Mappable methods, usermem.AddrRanges and
-// MappableRanges must be non-empty (Length() != 0), and usermem.Addrs and
-// Mappable offsets must be page-aligned.
+// All Mappable methods have the following preconditions:
+// * usermem.AddrRanges and MappableRanges must be non-empty (Length() != 0).
+// * usermem.Addrs and Mappable offsets must be page-aligned.
 type Mappable interface {
 	// AddMapping notifies the Mappable of a mapping from addresses ar in ms to
 	// offsets [offset, offset+ar.Length()) in this Mappable.
@@ -48,8 +48,10 @@ type Mappable interface {
 	// addresses ar in ms to offsets [offset, offset+ar.Length()) in this
 	// Mappable.
 	//
-	// Preconditions: offset+ar.Length() does not overflow. The removed mapping
-	// must exist. writable must match the corresponding call to AddMapping.
+	// Preconditions:
+	// * offset+ar.Length() does not overflow.
+	// * The removed mapping must exist. writable must match the
+	//   corresponding call to AddMapping.
 	RemoveMapping(ctx context.Context, ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool)
 
 	// CopyMapping notifies the Mappable of an attempt to copy a mapping in ms
@@ -60,9 +62,10 @@ type Mappable interface {
 	// CopyMapping is only called when a mapping is copied within a given
 	// MappingSpace; it is analogous to Linux's vm_operations_struct::mremap.
 	//
-	// Preconditions: offset+srcAR.Length() and offset+dstAR.Length() do not
-	// overflow. The mapping at srcAR must exist. writable must match the
-	// corresponding call to AddMapping.
+	// Preconditions:
+	// * offset+srcAR.Length() and offset+dstAR.Length() do not overflow.
+	// * The mapping at srcAR must exist. writable must match the
+	//   corresponding call to AddMapping.
 	CopyMapping(ctx context.Context, ms MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error
 
 	// Translate returns the Mappable's current mappings for at least the range
@@ -77,11 +80,14 @@ type Mappable interface {
 	// reference is held on all pages in a File that may be the result
 	// of a valid Translation.
 	//
-	// Preconditions: required.Length() > 0. optional.IsSupersetOf(required).
-	// required and optional must be page-aligned. The caller must have
-	// established a mapping for all of the queried offsets via a previous call
-	// to AddMapping. The caller is responsible for ensuring that calls to
-	// Translate synchronize with invalidation.
+	// Preconditions:
+	// * required.Length() > 0.
+	// * optional.IsSupersetOf(required).
+	// * required and optional must be page-aligned.
+	// * The caller must have established a mapping for all of the queried
+	//   offsets via a previous call to AddMapping.
+	// * The caller is responsible for ensuring that calls to Translate
+	//   synchronize with invalidation.
 	//
 	// Postconditions: See CheckTranslateResult.
 	Translate(ctx context.Context, required, optional MappableRange, at usermem.AccessType) ([]Translation, error)
@@ -118,10 +124,10 @@ func (t Translation) FileRange() FileRange {
 // CheckTranslateResult returns an error if (ts, terr) does not satisfy all
 // postconditions for Mappable.Translate(required, optional, at).
 //
-// Preconditions: As for Mappable.Translate.
+// Preconditions: Same as Mappable.Translate.
 func CheckTranslateResult(required, optional MappableRange, at usermem.AccessType, ts []Translation, terr error) error {
 	// Verify that the inputs to Mappable.Translate were valid.
-	if !required.WellFormed() || required.Length() <= 0 {
+	if !required.WellFormed() || required.Length() == 0 {
 		panic(fmt.Sprintf("invalid required range: %v", required))
 	}
 	if !usermem.Addr(required.Start).IsPageAligned() || !usermem.Addr(required.End).IsPageAligned() {
@@ -139,7 +145,7 @@ func CheckTranslateResult(required, optional MappableRange, at usermem.AccessTyp
 		return fmt.Errorf("first Translation %+v does not cover start of required range %v", ts[0], required)
 	}
 	for i, t := range ts {
-		if !t.Source.WellFormed() || t.Source.Length() <= 0 {
+		if !t.Source.WellFormed() || t.Source.Length() == 0 {
 			return fmt.Errorf("Translation %+v has invalid Source", t)
 		}
 		if !usermem.Addr(t.Source.Start).IsPageAligned() || !usermem.Addr(t.Source.End).IsPageAligned() {
@@ -214,7 +220,9 @@ type MappingSpace interface {
 	// Invalidate must not take any locks preceding mm.MemoryManager.activeMu
 	// in the lock order.
 	//
-	// Preconditions: ar.Length() != 0. ar must be page-aligned.
+	// Preconditions:
+	// * ar.Length() != 0.
+	// * ar must be page-aligned.
 	Invalidate(ar usermem.AddrRange, opts InvalidateOpts)
 }
 
@@ -375,16 +383,20 @@ type File interface {
 
 	// IncRef increments the reference count on all pages in fr.
 	//
-	// Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
-	// 0. At least one reference must be held on all pages in fr. (The File
-	// interface does not provide a way to acquire an initial reference;
-	// implementors may define mechanisms for doing so.)
+	// Preconditions:
+	// * fr.Start and fr.End must be page-aligned.
+	// * fr.Length() > 0.
+	// * At least one reference must be held on all pages in fr. (The File
+	//   interface does not provide a way to acquire an initial reference;
+	//   implementors may define mechanisms for doing so.)
 	IncRef(fr FileRange)
 
 	// DecRef decrements the reference count on all pages in fr.
 	//
-	// Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
-	// 0. At least one reference must be held on all pages in fr.
+	// Preconditions:
+	// * fr.Start and fr.End must be page-aligned.
+	// * fr.Length() > 0.
+	// * At least one reference must be held on all pages in fr.
 	DecRef(fr FileRange)
 
 	// MapInternal returns a mapping of the given file offsets in the invoking
@@ -392,8 +404,9 @@ type File interface {
 	//
 	// Note that fr.Start and fr.End need not be page-aligned.
 	//
-	// Preconditions: fr.Length() > 0. At least one reference must be held on
-	// all pages in fr.
+	// Preconditions:
+	// * fr.Length() > 0.
+	// * At least one reference must be held on all pages in fr.
 	//
 	// Postconditions: The returned mapping is valid as long as at least one
 	// reference is held on the mapped pages.
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index f9d0837a1..6dbeccfe2 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -73,12 +73,35 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "aio_mappable_refs",
+    out = "aio_mappable_refs.go",
+    package = "mm",
+    prefix = "aioMappable",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "aioMappable",
+    },
+)
+
+go_template_instance(
+    name = "special_mappable_refs",
+    out = "special_mappable_refs.go",
+    package = "mm",
+    prefix = "SpecialMappable",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "SpecialMappable",
+    },
+)
+
 go_library(
     name = "mm",
     srcs = [
         "address_space.go",
         "aio_context.go",
         "aio_context_state.go",
+        "aio_mappable_refs.go",
         "debug.go",
         "file_refcount_set.go",
         "io.go",
@@ -92,6 +115,7 @@ go_library(
         "save_restore.go",
         "shm.go",
         "special_mappable.go",
+        "special_mappable_refs.go",
         "syscalls.go",
         "vma.go",
         "vma_set.go",
@@ -103,6 +127,7 @@ go_library(
         "//pkg/context",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/safecopy",
         "//pkg/safemem",
         "//pkg/sentry/arch",
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index 5c667117c..a93e76c75 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -166,8 +166,12 @@ func (mm *MemoryManager) Deactivate() {
 // mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings
 // for all addresses in ar should be precommitted.
 //
-// Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0.
-// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start).
+// Preconditions:
+// * mm.activeMu must be locked.
+// * mm.as != nil.
+// * ar.Length() != 0.
+// * ar must be page-aligned.
+// * pseg == mm.pmas.LowerBoundSegment(ar.Start).
 func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error {
 	// By default, map entire pmas at a time, under the assumption that there
 	// is no cost to mapping more of a pma than necessary.
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index 16fea53c4..7bf48cb2c 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -17,7 +17,6 @@ package mm
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
@@ -239,7 +238,7 @@ func (ctx *AIOContext) Drain() {
 //
 // +stateify savable
 type aioMappable struct {
-	refs.AtomicRefCount
+	aioMappableRefs
 
 	mfp pgalloc.MemoryFileProvider
 	fr  memmap.FileRange
@@ -253,13 +252,13 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) {
 		return nil, err
 	}
 	m := aioMappable{mfp: mfp, fr: fr}
-	m.EnableLeakCheck("mm.aioMappable")
+	m.EnableLeakCheck()
 	return &m, nil
 }
 
 // DecRef implements refs.RefCounter.DecRef.
 func (m *aioMappable) DecRef(ctx context.Context) {
-	m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
+	m.aioMappableRefs.DecRef(func() {
 		m.mfp.MemoryFile().DecRef(m.fr)
 	})
 }
diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go
index fa776f9c6..a8ac48080 100644
--- a/pkg/sentry/mm/io.go
+++ b/pkg/sentry/mm/io.go
@@ -441,7 +441,10 @@ func (mm *MemoryManager) LoadUint32(ctx context.Context, addr usermem.Addr, opts
 // handleASIOFault handles a page fault at address addr for an AddressSpaceIO
 // operation spanning ioar.
 //
-// Preconditions: mm.as != nil. ioar.Length() != 0. ioar.Contains(addr).
+// Preconditions:
+// * mm.as != nil.
+// * ioar.Length() != 0.
+// * ioar.Contains(addr).
 func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, ioar usermem.AddrRange, at usermem.AccessType) error {
 	// Try to map all remaining pages in the I/O operation. This RoundUp can't
 	// overflow because otherwise it would have been caught by CheckIORange.
@@ -629,7 +632,9 @@ func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars userme
 // at most address end on AddrRange arsit.Head(). It is used in vector I/O paths to
 // truncate usermem.AddrRangeSeq when errors occur.
 //
-// Preconditions: !arsit.IsEmpty(). end <= arsit.Head().End.
+// Preconditions:
+// * !arsit.IsEmpty().
+// * end <= arsit.Head().End.
 func truncatedAddrRangeSeq(ars, arsit usermem.AddrRangeSeq, end usermem.Addr) usermem.AddrRangeSeq {
 	ar := arsit.Head()
 	if end <= ar.Start {
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 3e85964e4..92cc87d84 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -235,6 +235,20 @@ type MemoryManager struct {
 
 	// vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
 	vdsoSigReturnAddr uint64
+
+	// membarrierPrivateEnabled is non-zero if EnableMembarrierPrivate has
+	// previously been called. Since, as of this writing,
+	// MEMBARRIER_CMD_PRIVATE_EXPEDITED is implemented as a global memory
+	// barrier, membarrierPrivateEnabled has no other effect.
+	//
+	// membarrierPrivateEnabled is accessed using atomic memory operations.
+	membarrierPrivateEnabled uint32
+
+	// membarrierRSeqEnabled is non-zero if EnableMembarrierRSeq has previously
+	// been called.
+	//
+	// membarrierRSeqEnabled is accessed using atomic memory operations.
+	membarrierRSeqEnabled uint32
 }
 
 // vma represents a virtual memory area.
@@ -242,7 +256,7 @@ type MemoryManager struct {
 // +stateify savable
 type vma struct {
 	// mappable is the virtual memory object mapped by this vma. If mappable is
-	// nil, the vma represents a private anonymous mapping.
+	// nil, the vma represents an anonymous mapping.
 	mappable memmap.Mappable
 
 	// off is the offset into mappable at which this vma begins. If mappable is
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go
index fdc308542..acac3d357 100644
--- a/pkg/sentry/mm/mm_test.go
+++ b/pkg/sentry/mm/mm_test.go
@@ -51,7 +51,8 @@ func TestUsageASUpdates(t *testing.T) {
 	defer mm.DecUsers(ctx)
 
 	addr, err := mm.MMap(ctx, memmap.MMapOpts{
-		Length: 2 * usermem.PageSize,
+		Length:  2 * usermem.PageSize,
+		Private: true,
 	})
 	if err != nil {
 		t.Fatalf("MMap got err %v want nil", err)
diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go
index 930ec895f..7e5f7de64 100644
--- a/pkg/sentry/mm/pma.go
+++ b/pkg/sentry/mm/pma.go
@@ -31,10 +31,12 @@ import (
 // iterator to the pma containing ar.Start. Otherwise it returns a terminal
 // iterator.
 //
-// Preconditions: mm.activeMu must be locked. ar.Length() != 0.
+// Preconditions:
+// * mm.activeMu must be locked.
+// * ar.Length() != 0.
 func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator {
 	if checkInvariants {
-		if !ar.WellFormed() || ar.Length() <= 0 {
+		if !ar.WellFormed() || ar.Length() == 0 {
 			panic(fmt.Sprintf("invalid ar: %v", ar))
 		}
 	}
@@ -89,13 +91,16 @@ func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at user
 //
 // - An error that is non-nil if pmas exist for only a subset of ar.
 //
-// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for
-// writing. ar.Length() != 0. vseg.Range().Contains(ar.Start). vmas must exist
-// for all addresses in ar, and support accesses of type at (i.e. permission
-// checks must have been performed against vmas).
+// Preconditions:
+// * mm.mappingMu must be locked.
+// * mm.activeMu must be locked for writing.
+// * ar.Length() != 0.
+// * vseg.Range().Contains(ar.Start).
+// * vmas must exist for all addresses in ar, and support accesses of type at
+//   (i.e. permission checks must have been performed against vmas).
 func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) {
 	if checkInvariants {
-		if !ar.WellFormed() || ar.Length() <= 0 {
+		if !ar.WellFormed() || ar.Length() == 0 {
 			panic(fmt.Sprintf("invalid ar: %v", ar))
 		}
 		if !vseg.Ok() {
@@ -135,9 +140,11 @@ func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar
 // exist. If this is not equal to ars, it returns a non-nil error explaining
 // why.
 //
-// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for
-// writing. vmas must exist for all addresses in ars, and support accesses of
-// type at (i.e. permission checks must have been performed against vmas).
+// Preconditions:
+// * mm.mappingMu must be locked.
+// * mm.activeMu must be locked for writing.
+// * vmas must exist for all addresses in ars, and support accesses of type at
+//   (i.e. permission checks must have been performed against vmas).
 func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType) (usermem.AddrRangeSeq, error) {
 	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
 		ar := arsit.Head()
@@ -186,7 +193,7 @@ func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrR
 // getVecPMAsLocked; other clients should call one of those instead.
 func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) {
 	if checkInvariants {
-		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
 			panic(fmt.Sprintf("invalid ar: %v", ar))
 		}
 		if !vseg.Ok() {
@@ -216,7 +223,7 @@ func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIter
 				// Need a pma here.
 				optAR := vseg.Range().Intersect(pgap.Range())
 				if checkInvariants {
-					if optAR.Length() <= 0 {
+					if optAR.Length() == 0 {
 						panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap))
 					}
 				}
@@ -518,8 +525,10 @@ func privateAligned(ar usermem.AddrRange) usermem.AddrRange {
 // the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory
 // and update the pma to indicate that it does not require copy-on-write.
 //
-// Preconditions: vseg.Range().IsSupersetOf(pseg.Range()). mm.mappingMu must be
-// locked. mm.activeMu must be locked for writing.
+// Preconditions:
+// * vseg.Range().IsSupersetOf(pseg.Range()).
+// * mm.mappingMu must be locked.
+// * mm.activeMu must be locked for writing.
 func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool {
 	pma := pseg.ValuePtr()
 	if !pma.needCOW {
@@ -551,7 +560,7 @@ func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterat
 // Invalidate implements memmap.MappingSpace.Invalidate.
 func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) {
 	if checkInvariants {
-		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
 			panic(fmt.Sprintf("invalid ar: %v", ar))
 		}
 	}
@@ -568,11 +577,13 @@ func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.Invalidate
 // invalidateLocked removes pmas and AddressSpace mappings of those pmas for
 // addresses in ar.
 //
-// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar
-// must be page-aligned.
+// Preconditions:
+// * mm.activeMu must be locked for writing.
+// * ar.Length() != 0.
+// * ar must be page-aligned.
 func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivate, invalidateShared bool) {
 	if checkInvariants {
-		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
 			panic(fmt.Sprintf("invalid ar: %v", ar))
 		}
 	}
@@ -613,10 +624,12 @@ func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivat
 // most I/O. It should only be used in contexts that would use get_user_pages()
 // in the Linux kernel.
 //
-// Preconditions: ar.Length() != 0. ar must be page-aligned.
+// Preconditions:
+// * ar.Length() != 0.
+// * ar must be page-aligned.
 func (mm *MemoryManager) Pin(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) ([]PinnedRange, error) {
 	if checkInvariants {
-		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
 			panic(fmt.Sprintf("invalid ar: %v", ar))
 		}
 	}
@@ -693,15 +706,19 @@ func Unpin(prs []PinnedRange) {
 
 // movePMAsLocked moves all pmas in oldAR to newAR.
 //
-// Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0.
-// oldAR.Length() <= newAR.Length(). !oldAR.Overlaps(newAR).
-// mm.pmas.IsEmptyRange(newAR). oldAR and newAR must be page-aligned.
+// Preconditions:
+// * mm.activeMu must be locked for writing.
+// * oldAR.Length() != 0.
+// * oldAR.Length() <= newAR.Length().
+// * !oldAR.Overlaps(newAR).
+// * mm.pmas.IsEmptyRange(newAR).
+// * oldAR and newAR must be page-aligned.
 func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) {
 	if checkInvariants {
-		if !oldAR.WellFormed() || oldAR.Length() <= 0 || !oldAR.IsPageAligned() {
+		if !oldAR.WellFormed() || oldAR.Length() == 0 || !oldAR.IsPageAligned() {
 			panic(fmt.Sprintf("invalid oldAR: %v", oldAR))
 		}
-		if !newAR.WellFormed() || newAR.Length() <= 0 || !newAR.IsPageAligned() {
+		if !newAR.WellFormed() || newAR.Length() == 0 || !newAR.IsPageAligned() {
 			panic(fmt.Sprintf("invalid newAR: %v", newAR))
 		}
 		if oldAR.Length() > newAR.Length() {
@@ -751,15 +768,17 @@ func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) {
 // - An error that is non-nil if internal mappings exist for only a subset of
 // ar.
 //
-// Preconditions: mm.activeMu must be locked for writing.
-// pseg.Range().Contains(ar.Start). pmas must exist for all addresses in ar.
-// ar.Length() != 0.
+// Preconditions:
+// * mm.activeMu must be locked for writing.
+// * pseg.Range().Contains(ar.Start).
+// * pmas must exist for all addresses in ar.
+// * ar.Length() != 0.
 //
 // Postconditions: getPMAInternalMappingsLocked does not invalidate iterators
 // into mm.pmas.
 func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, error) {
 	if checkInvariants {
-		if !ar.WellFormed() || ar.Length() <= 0 {
+		if !ar.WellFormed() || ar.Length() == 0 {
 			panic(fmt.Sprintf("invalid ar: %v", ar))
 		}
 		if !pseg.Range().Contains(ar.Start) {
@@ -783,8 +802,9 @@ func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar userm
 // internal mappings exist. If this is not equal to ars, it returns a non-nil
 // error explaining why.
 //
-// Preconditions: mm.activeMu must be locked for writing. pmas must exist for
-// all addresses in ar.
+// Preconditions:
+// * mm.activeMu must be locked for writing.
+// * pmas must exist for all addresses in ar.
 //
 // Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators
 // into mm.pmas.
@@ -803,12 +823,15 @@ func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars usermem.AddrRangeSe
 
 // internalMappingsLocked returns internal mappings for addresses in ar.
 //
-// Preconditions: mm.activeMu must be locked. Internal mappings must have been
-// previously established for all addresses in ar. ar.Length() != 0.
-// pseg.Range().Contains(ar.Start).
+// Preconditions:
+// * mm.activeMu must be locked.
+// * Internal mappings must have been previously established for all addresses
+//   in ar.
+// * ar.Length() != 0.
+// * pseg.Range().Contains(ar.Start).
 func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) safemem.BlockSeq {
 	if checkInvariants {
-		if !ar.WellFormed() || ar.Length() <= 0 {
+		if !ar.WellFormed() || ar.Length() == 0 {
 			panic(fmt.Sprintf("invalid ar: %v", ar))
 		}
 		if !pseg.Range().Contains(ar.Start) {
@@ -839,8 +862,10 @@ func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.Add
 
 // vecInternalMappingsLocked returns internal mappings for addresses in ars.
 //
-// Preconditions: mm.activeMu must be locked. Internal mappings must have been
-// previously established for all addresses in ars.
+// Preconditions:
+// * mm.activeMu must be locked.
+// * Internal mappings must have been previously established for all addresses
+//   in ars.
 func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) safemem.BlockSeq {
 	var ims []safemem.Block
 	for ; !ars.IsEmpty(); ars = ars.Tail() {
@@ -969,7 +994,9 @@ func (pmaSetFunctions) Split(ar usermem.AddrRange, p pma, split usermem.Addr) (p
 // findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do
 // so by scanning linearly backward from pgap.
 //
-// Preconditions: mm.activeMu must be locked. addr <= pgap.Start().
+// Preconditions:
+// * mm.activeMu must be locked.
+// * addr <= pgap.Start().
 func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr usermem.Addr, pgap pmaGapIterator) pmaIterator {
 	if checkInvariants {
 		if !pgap.Ok() {
@@ -1015,13 +1042,15 @@ func (pseg pmaIterator) fileRange() memmap.FileRange {
 	return pseg.fileRangeOf(pseg.Range())
 }
 
-// Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0.
+// Preconditions:
+// * pseg.Range().IsSupersetOf(ar).
+// * ar.Length != 0.
 func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) memmap.FileRange {
 	if checkInvariants {
 		if !pseg.Ok() {
 			panic("terminal pma iterator")
 		}
-		if !ar.WellFormed() || ar.Length() <= 0 {
+		if !ar.WellFormed() || ar.Length() == 0 {
 			panic(fmt.Sprintf("invalid ar: %v", ar))
 		}
 		if !pseg.Range().IsSupersetOf(ar) {
diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go
index 4cdb52eb6..2dbe5b751 100644
--- a/pkg/sentry/mm/special_mappable.go
+++ b/pkg/sentry/mm/special_mappable.go
@@ -16,7 +16,6 @@ package mm
 
 import (
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
@@ -31,7 +30,7 @@ import (
 //
 // +stateify savable
 type SpecialMappable struct {
-	refs.AtomicRefCount
+	SpecialMappableRefs
 
 	mfp  pgalloc.MemoryFileProvider
 	fr   memmap.FileRange
@@ -45,13 +44,13 @@ type SpecialMappable struct {
 // Preconditions: fr.Length() != 0.
 func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable {
 	m := SpecialMappable{mfp: mfp, fr: fr, name: name}
-	m.EnableLeakCheck("mm.SpecialMappable")
+	m.EnableLeakCheck()
 	return &m
 }
 
 // DecRef implements refs.RefCounter.DecRef.
 func (m *SpecialMappable) DecRef(ctx context.Context) {
-	m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
+	m.SpecialMappableRefs.DecRef(func() {
 		m.mfp.MemoryFile().DecRef(m.fr)
 	})
 }
@@ -137,9 +136,12 @@ func (m *SpecialMappable) Length() uint64 {
 // NewSharedAnonMappable returns a SpecialMappable that implements the
 // semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero.
 //
-// TODO(jamieliu): The use of SpecialMappable is a lazy code reuse hack. Linux
-// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should
-// do the same to get non-zero device and inode IDs.
+// TODO(gvisor.dev/issue/1624): Linux uses an ephemeral file created by
+// mm/shmem.c:shmem_zero_setup(), and VFS2 does something analogous. VFS1 uses
+// a SpecialMappable instead, incorrectly getting device and inode IDs of zero
+// and causing memory for shared anonymous mappings to be allocated up-front
+// instead of on first touch; this is to avoid exacerbating the fs.MountSource
+// leak (b/143656263). Delete this function along with VFS1.
 func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) {
 	if length == 0 {
 		return nil, syserror.EINVAL
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index e74d4e1c1..675efdc7c 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -17,6 +17,7 @@ package mm
 import (
 	"fmt"
 	mrand "math/rand"
+	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -24,7 +25,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -93,18 +93,6 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
 		}
 	} else {
 		opts.Offset = 0
-		if !opts.Private {
-			if opts.MappingIdentity != nil {
-				return 0, syserror.EINVAL
-			}
-			m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx))
-			if err != nil {
-				return 0, err
-			}
-			defer m.DecRef(ctx)
-			opts.MappingIdentity = m
-			opts.Mappable = m
-		}
 	}
 
 	if opts.Addr.RoundDown() != opts.Addr {
@@ -166,7 +154,9 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
 // populateVMA obtains pmas for addresses in ar in the given vma, and maps them
 // into mm.as if it is active.
 //
-// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar).
+// Preconditions:
+// * mm.mappingMu must be locked.
+// * vseg.Range().IsSupersetOf(ar).
 func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
 	if !vseg.ValuePtr().effectivePerms.Any() {
 		// Linux doesn't populate inaccessible pages. See
@@ -208,8 +198,9 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar u
 // preferable to populateVMA since it unlocks mm.mappingMu before performing
 // expensive operations that don't require it to be locked.
 //
-// Preconditions: mm.mappingMu must be locked for writing.
-// vseg.Range().IsSupersetOf(ar).
+// Preconditions:
+// * mm.mappingMu must be locked for writing.
+// * vseg.Range().IsSupersetOf(ar).
 //
 // Postconditions: mm.mappingMu will be unlocked.
 func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
@@ -1284,3 +1275,27 @@ func (mm *MemoryManager) VirtualDataSize() uint64 {
 	defer mm.mappingMu.RUnlock()
 	return mm.dataAS
 }
+
+// EnableMembarrierPrivate causes future calls to IsMembarrierPrivateEnabled to
+// return true.
+func (mm *MemoryManager) EnableMembarrierPrivate() {
+	atomic.StoreUint32(&mm.membarrierPrivateEnabled, 1)
+}
+
+// IsMembarrierPrivateEnabled returns true if mm.EnableMembarrierPrivate() has
+// previously been called.
+func (mm *MemoryManager) IsMembarrierPrivateEnabled() bool {
+	return atomic.LoadUint32(&mm.membarrierPrivateEnabled) != 0
+}
+
+// EnableMembarrierRSeq causes future calls to IsMembarrierRSeqEnabled to
+// return true.
+func (mm *MemoryManager) EnableMembarrierRSeq() {
+	atomic.StoreUint32(&mm.membarrierRSeqEnabled, 1)
+}
+
+// IsMembarrierRSeqEnabled returns true if mm.EnableMembarrierRSeq() has
+// previously been called.
+func (mm *MemoryManager) IsMembarrierRSeqEnabled() bool {
+	return atomic.LoadUint32(&mm.membarrierRSeqEnabled) != 0
+}
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
index c4e1989ed..b8df72813 100644
--- a/pkg/sentry/mm/vma.go
+++ b/pkg/sentry/mm/vma.go
@@ -27,8 +27,9 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// Preconditions: mm.mappingMu must be locked for writing. opts must be valid
-// as defined by the checks in MMap.
+// Preconditions:
+// * mm.mappingMu must be locked for writing.
+// * opts must be valid as defined by the checks in MMap.
 func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, usermem.AddrRange, error) {
 	if opts.MaxPerms != opts.MaxPerms.Effective() {
 		panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms))
@@ -260,11 +261,12 @@ func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 {
 //
 // - An error that is non-nil if vmas exist for only a subset of ar.
 //
-// Preconditions: mm.mappingMu must be locked for reading; it may be
-// temporarily unlocked. ar.Length() != 0.
+// Preconditions:
+// * mm.mappingMu must be locked for reading; it may be temporarily unlocked.
+// * ar.Length() != 0.
 func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) {
 	if checkInvariants {
-		if !ar.WellFormed() || ar.Length() <= 0 {
+		if !ar.WellFormed() || ar.Length() == 0 {
 			panic(fmt.Sprintf("invalid ar: %v", ar))
 		}
 	}
@@ -342,11 +344,13 @@ const guardBytes = 256 * usermem.PageSize
 // unmapLocked unmaps all addresses in ar and returns the resulting gap in
 // mm.vmas.
 //
-// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0.
-// ar must be page-aligned.
+// Preconditions:
+// * mm.mappingMu must be locked for writing.
+// * ar.Length() != 0.
+// * ar must be page-aligned.
 func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
 	if checkInvariants {
-		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
 			panic(fmt.Sprintf("invalid ar: %v", ar))
 		}
 	}
@@ -361,11 +365,13 @@ func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange)
 // gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients
 // must do so before calling removeVMAsLocked.
 //
-// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. ar
-// must be page-aligned.
+// Preconditions:
+// * mm.mappingMu must be locked for writing.
+// * ar.Length() != 0.
+// * ar must be page-aligned.
 func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
 	if checkInvariants {
-		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
 			panic(fmt.Sprintf("invalid ar: %v", ar))
 		}
 	}
@@ -467,7 +473,9 @@ func (vmaSetFunctions) Split(ar usermem.AddrRange, v vma, split usermem.Addr) (v
 	return v, v2
 }
 
-// Preconditions: vseg.ValuePtr().mappable != nil. vseg.Range().Contains(addr).
+// Preconditions:
+// * vseg.ValuePtr().mappable != nil.
+// * vseg.Range().Contains(addr).
 func (vseg vmaIterator) mappableOffsetAt(addr usermem.Addr) uint64 {
 	if checkInvariants {
 		if !vseg.Ok() {
@@ -491,8 +499,10 @@ func (vseg vmaIterator) mappableRange() memmap.MappableRange {
 	return vseg.mappableRangeOf(vseg.Range())
 }
 
-// Preconditions: vseg.ValuePtr().mappable != nil.
-// vseg.Range().IsSupersetOf(ar). ar.Length() != 0.
+// Preconditions:
+// * vseg.ValuePtr().mappable != nil.
+// * vseg.Range().IsSupersetOf(ar).
+// * ar.Length() != 0.
 func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRange {
 	if checkInvariants {
 		if !vseg.Ok() {
@@ -501,7 +511,7 @@ func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRan
 		if vseg.ValuePtr().mappable == nil {
 			panic("MappableRange is meaningless for anonymous vma")
 		}
-		if !ar.WellFormed() || ar.Length() <= 0 {
+		if !ar.WellFormed() || ar.Length() == 0 {
 			panic(fmt.Sprintf("invalid ar: %v", ar))
 		}
 		if !vseg.Range().IsSupersetOf(ar) {
@@ -514,8 +524,10 @@ func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRan
 	return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)}
 }
 
-// Preconditions: vseg.ValuePtr().mappable != nil.
-// vseg.mappableRange().IsSupersetOf(mr). mr.Length() != 0.
+// Preconditions:
+// * vseg.ValuePtr().mappable != nil.
+// * vseg.mappableRange().IsSupersetOf(mr).
+// * mr.Length() != 0.
 func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange {
 	if checkInvariants {
 		if !vseg.Ok() {
@@ -524,7 +536,7 @@ func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange {
 		if vseg.ValuePtr().mappable == nil {
 			panic("MappableRange is meaningless for anonymous vma")
 		}
-		if !mr.WellFormed() || mr.Length() <= 0 {
+		if !mr.WellFormed() || mr.Length() == 0 {
 			panic(fmt.Sprintf("invalid mr: %v", mr))
 		}
 		if !vseg.mappableRange().IsSupersetOf(mr) {
@@ -540,7 +552,9 @@ func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange {
 // seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by
 // scanning linearly forward from vseg.
 //
-// Preconditions: mm.mappingMu must be locked. addr >= vseg.Start().
+// Preconditions:
+// * mm.mappingMu must be locked.
+// * addr >= vseg.Start().
 func (vseg vmaIterator) seekNextLowerBound(addr usermem.Addr) vmaIterator {
 	if checkInvariants {
 		if !vseg.Ok() {
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 7a3311a70..5b09b9feb 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -83,6 +83,7 @@ go_library(
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
+        "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/log",
         "//pkg/memutil",
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 46d3be58c..7c297fb9e 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -29,6 +29,7 @@ import (
 	"syscall"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/safemem"
@@ -224,6 +225,18 @@ type usageInfo struct {
 	refs uint64
 }
 
+// canCommit returns true if the tracked region can be committed.
+func (u *usageInfo) canCommit() bool {
+	// refs must be greater than 0 because we assume that reclaimable pages
+	// (that aren't already known to be committed) are not committed. This
+	// isn't necessarily true, even after the reclaimer does Decommit(),
+	// because the kernel may subsequently back the hugepage-sized region
+	// containing the decommitted page with a hugepage. However, it's
+	// consistent with our treatment of unallocated pages, which have the same
+	// property.
+	return !u.knownCommitted && u.refs != 0
+}
+
 // An EvictableMemoryUser represents a user of MemoryFile-allocated memory that
 // may be asked to deallocate that memory in the presence of memory pressure.
 type EvictableMemoryUser interface {
@@ -507,7 +520,9 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6
 // nearest page. If this is shorter than length bytes due to an error returned
 // by r.ReadToBlocks(), it returns that error.
 //
-// Preconditions: length > 0. length must be page-aligned.
+// Preconditions:
+// * length > 0.
+// * length must be page-aligned.
 func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (memmap.FileRange, error) {
 	fr, err := f.Allocate(length, kind)
 	if err != nil {
@@ -826,6 +841,11 @@ func (f *MemoryFile) UpdateUsage() error {
 		log.Debugf("UpdateUsage: skipped with usageSwapped!=0.")
 		return nil
 	}
+	// Linux updates usage values at CONFIG_HZ.
+	if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC {
+		log.Debugf("UpdateUsage: skipped because previous scan happened %d ms back", scanningAfter)
+		return nil
+	}
 
 	f.usageLast = time.Now()
 	err = f.updateUsageLocked(currentUsage, mincore)
@@ -839,7 +859,7 @@ func (f *MemoryFile) UpdateUsage() error {
 // pages by invoking checkCommitted, which is a function that, for each page i
 // in bs, sets committed[i] to 1 if the page is committed and 0 otherwise.
 //
-// Precondition: f.mu must be held.
+// Precondition: f.mu must be held; it may be unlocked and reacquired.
 func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(bs []byte, committed []byte) error) error {
 	// Track if anything changed to elide the merge. In the common case, we
 	// expect all segments to be committed and no merge to occur.
@@ -866,7 +886,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
 		} else if f.usageSwapped != 0 {
 			// We have more usage accounted for than the file itself.
 			// That's fine, we probably caught a race where pages were
-			// being committed while the above loop was running. Just
+			// being committed while the below loop was running. Just
 			// report the higher number that we found and ignore swap.
 			usage.MemoryAccounting.Dec(f.usageSwapped, usage.System)
 			f.usageSwapped = 0
@@ -878,21 +898,9 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
 
 	// Iterate over all usage data. There will only be usage segments
 	// present when there is an associated reference.
-	for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
-		val := seg.Value()
-
-		// Already known to be committed; ignore.
-		if val.knownCommitted {
-			continue
-		}
-
-		// Assume that reclaimable pages (that aren't already known to be
-		// committed) are not committed. This isn't necessarily true, even
-		// after the reclaimer does Decommit(), because the kernel may
-		// subsequently back the hugepage-sized region containing the
-		// decommitted page with a hugepage. However, it's consistent with our
-		// treatment of unallocated pages, which have the same property.
-		if val.refs == 0 {
+	for seg := f.usage.FirstSegment(); seg.Ok(); {
+		if !seg.ValuePtr().canCommit() {
+			seg = seg.NextSegment()
 			continue
 		}
 
@@ -915,56 +923,53 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
 			}
 
 			// Query for new pages in core.
-			if err := checkCommitted(s, buf); err != nil {
+			// NOTE(b/165896008): mincore (which is passed as checkCommitted)
+			// by f.UpdateUsage() might take a really long time. So unlock f.mu
+			// while checkCommitted runs.
+			f.mu.Unlock()
+			err := checkCommitted(s, buf)
+			f.mu.Lock()
+			if err != nil {
 				checkErr = err
 				return
 			}
 
 			// Scan each page and switch out segments.
-			populatedRun := false
-			populatedRunStart := 0
-			for i := 0; i <= bufLen; i++ {
-				// We run past the end of the slice here to
-				// simplify the logic and only set populated if
-				// we're still looking at elements.
-				populated := false
-				if i < bufLen {
-					populated = buf[i]&0x1 != 0
-				}
-
-				switch {
-				case populated == populatedRun:
-					// Keep the run going.
-					continue
-				case populated && !populatedRun:
-					// Begin the run.
-					populatedRun = true
-					populatedRunStart = i
-					// Keep going.
+			seg := f.usage.LowerBoundSegment(r.Start)
+			for i := 0; i < bufLen; {
+				if buf[i]&0x1 == 0 {
+					i++
 					continue
-				case !populated && populatedRun:
-					// Finish the run by changing this segment.
-					runRange := memmap.FileRange{
-						Start: r.Start + uint64(populatedRunStart*usermem.PageSize),
-						End:   r.Start + uint64(i*usermem.PageSize),
+				}
+				// Scan to the end of this committed range.
+				j := i + 1
+				for ; j < bufLen; j++ {
+					if buf[j]&0x1 == 0 {
+						break
 					}
-					seg = f.usage.Isolate(seg, runRange)
-					seg.ValuePtr().knownCommitted = true
-					// Advance the segment only if we still
-					// have work to do in the context of
-					// the original segment from the for
-					// loop. Otherwise, the for loop itself
-					// will advance the segment
-					// appropriately.
-					if runRange.End != r.End {
-						seg = seg.NextSegment()
+				}
+				committedFR := memmap.FileRange{
+					Start: r.Start + uint64(i*usermem.PageSize),
+					End:   r.Start + uint64(j*usermem.PageSize),
+				}
+				// Advance seg to committedFR.Start.
+				for seg.Ok() && seg.End() < committedFR.Start {
+					seg = seg.NextSegment()
+				}
+				// Mark pages overlapping committedFR as committed.
+				for seg.Ok() && seg.Start() < committedFR.End {
+					if seg.ValuePtr().canCommit() {
+						seg = f.usage.Isolate(seg, committedFR)
+						seg.ValuePtr().knownCommitted = true
+						amount := seg.Range().Length()
+						usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind)
+						f.usageExpected += amount
+						changedAny = true
 					}
-					amount := runRange.Length()
-					usage.MemoryAccounting.Inc(amount, val.kind)
-					f.usageExpected += amount
-					changedAny = true
-					populatedRun = false
+					seg = seg.NextSegment()
 				}
+				// Continue scanning for committed pages.
+				i = j + 1
 			}
 
 			// Advance r.Start.
@@ -976,6 +981,9 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(
 		if err != nil {
 			return err
 		}
+
+		// Continue with the first segment after r.End.
+		seg = f.usage.LowerBoundSegment(r.End)
 	}
 
 	return nil
@@ -1167,8 +1175,10 @@ func (f *MemoryFile) startEvictionsLocked() bool {
 	return startedAny
 }
 
-// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be
-// locked.
+// Preconditions:
+// * info == f.evictable[user].
+// * !info.evicting.
+// * f.mu must be locked.
 func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) {
 	info.evicting = true
 	f.evictionWG.Add(1)
diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD
index 209b28053..db7d55ef2 100644
--- a/pkg/sentry/platform/BUILD
+++ b/pkg/sentry/platform/BUILD
@@ -15,6 +15,7 @@ go_library(
         "//pkg/context",
         "//pkg/seccomp",
         "//pkg/sentry/arch",
+        "//pkg/sentry/hostmm",
         "//pkg/sentry/memmap",
         "//pkg/usermem",
     ],
diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go
index 57be41647..9dfac3eae 100644
--- a/pkg/sentry/platform/interrupt/interrupt.go
+++ b/pkg/sentry/platform/interrupt/interrupt.go
@@ -54,8 +54,9 @@ type Forwarder struct {
 // }
 // defer f.Disable()
 //
-// Preconditions: r must not be nil. f must not already be forwarding
-// interrupts to a Receiver.
+// Preconditions:
+// * r must not be nil.
+// * f must not already be forwarding interrupts to a Receiver.
 func (f *Forwarder) Enable(r Receiver) bool {
 	if r == nil {
 		panic("nil Receiver")
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 3970dd81d..8ce411102 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -9,12 +9,12 @@ go_library(
         "bluepill.go",
         "bluepill_allocator.go",
         "bluepill_amd64.go",
-        "bluepill_amd64.s",
         "bluepill_amd64_unsafe.go",
         "bluepill_arm64.go",
         "bluepill_arm64.s",
         "bluepill_arm64_unsafe.go",
         "bluepill_fault.go",
+        "bluepill_impl_amd64.s",
         "bluepill_unsafe.go",
         "context.go",
         "filters_amd64.go",
@@ -56,6 +56,7 @@ go_library(
         "//pkg/sentry/time",
         "//pkg/sync",
         "//pkg/usermem",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
 
@@ -63,6 +64,7 @@ go_test(
     name = "kvm_test",
     srcs = [
         "kvm_amd64_test.go",
+        "kvm_arm64_test.go",
         "kvm_test.go",
         "virtual_map_test.go",
     ],
@@ -78,6 +80,15 @@ go_test(
         "//pkg/sentry/platform/kvm/testutil",
         "//pkg/sentry/platform/ring0",
         "//pkg/sentry/platform/ring0/pagetables",
+        "//pkg/sentry/time",
         "//pkg/usermem",
     ],
 )
+
+genrule(
+    name = "bluepill_impl_amd64",
+    srcs = ["bluepill_amd64.s"],
+    outs = ["bluepill_impl_amd64.s"],
+    cmd = "(echo -e '// build +amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@",
+    tools = ["//pkg/sentry/platform/ring0/gen_offsets"],
+)
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s
index 2bc34a435..025ea93b5 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64.s
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.s
@@ -19,11 +19,6 @@
 // This is guaranteed to be zero.
 #define VCPU_CPU 0x0
 
-// CPU_SELF is the self reference in ring0's percpu.
-//
-// This is guaranteed to be zero.
-#define CPU_SELF 0x0
-
 // Context offsets.
 //
 // Only limited use of the context is done in the assembly stub below, most is
@@ -44,7 +39,7 @@ begin:
 	LEAQ VCPU_CPU(AX), BX
 	BYTE CLI;
 check_vcpu:
-	MOVQ CPU_SELF(GS), CX
+	MOVQ ENTRY_CPU_SELF(GS), CX
 	CMPQ BX, CX
 	JE right_vCPU
 wrong_vcpu:
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
index 03a98512e..0a54dd30d 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -83,5 +83,34 @@ func bluepillStopGuest(c *vCPU) {
 //
 //go:nosplit
 func bluepillReadyStopGuest(c *vCPU) bool {
-	return c.runData.readyForInterruptInjection != 0
+	if c.runData.readyForInterruptInjection == 0 {
+		return false
+	}
+
+	if c.runData.ifFlag == 0 {
+		// This is impossible if readyForInterruptInjection is 1.
+		throw("interrupts are disabled")
+	}
+
+	// Disable interrupts if we are in the kernel space.
+	//
+	// When the Sentry switches into the kernel mode, it disables
+	// interrupts. But when goruntime switches on a goroutine which has
+	// been saved in the host mode, it restores flags and this enables
+	// interrupts.  See the comment of UserFlagsSet for more details.
+	uregs := userRegs{}
+	err := c.getUserRegisters(&uregs)
+	if err != 0 {
+		throw("failed to get user registers")
+	}
+
+	if ring0.IsKernelFlags(uregs.RFLAGS) {
+		uregs.RFLAGS &^= ring0.KernelFlagsClear
+		err = c.setUserRegisters(&uregs)
+		if err != 0 {
+			throw("failed to set user registers")
+		}
+		return false
+	}
+	return true
 }
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
index ed5ae03d3..58f3d6fdd 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -39,6 +39,16 @@ var (
 	}
 )
 
+// getTLS returns the value of TPIDR_EL0 register.
+//
+//go:nosplit
+func getTLS() (value uint64)
+
+// setTLS writes the TPIDR_EL0 value.
+//
+//go:nosplit
+func setTLS(value uint64)
+
 // bluepillArchEnter is called during bluepillEnter.
 //
 //go:nosplit
@@ -51,6 +61,8 @@ func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) {
 	regs.Pstate = context.Pstate
 	regs.Pstate &^= uint64(ring0.PsrFlagsClear)
 	regs.Pstate |= ring0.KernelFlagsSet
+	regs.TPIDR_EL0 = getTLS()
+
 	return
 }
 
@@ -65,6 +77,7 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
 	context.Pstate = regs.Pstate
 	context.Pstate &^= uint64(ring0.PsrFlagsClear)
 	context.Pstate |= ring0.UserFlagsSet
+	setTLS(regs.TPIDR_EL0)
 
 	lazyVfp := c.GetLazyVFP()
 	if lazyVfp != 0 {
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s
index 04efa0147..09c7e88e5 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.s
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.s
@@ -32,6 +32,18 @@
 #define CONTEXT_PC  0x1B8
 #define CONTEXT_R0 0xB8
 
+// getTLS returns the value of TPIDR_EL0 register.
+TEXT ·getTLS(SB),NOSPLIT,$0-8
+	MRS TPIDR_EL0, R1
+	MOVD R1, ret+0(FP)
+	RET
+
+// setTLS writes the TPIDR_EL0 value.
+TEXT ·setTLS(SB),NOSPLIT,$0-8
+	MOVD addr+0(FP), R1
+	MSR R1, TPIDR_EL0
+	RET
+
 // See bluepill.go.
 TEXT ·bluepill(SB),NOSPLIT,$0
 begin:
diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go
index e34f46aeb..a182e4f22 100644
--- a/pkg/sentry/platform/kvm/bluepill_fault.go
+++ b/pkg/sentry/platform/kvm/bluepill_fault.go
@@ -98,6 +98,10 @@ func handleBluepillFault(m *machine, physical uintptr, phyRegions []physicalRegi
 	}
 	errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart, flags)
 	if errno == 0 {
+		// Store the physical address in the slot. This is used to
+		// avoid calls to handleBluepillFault in the future (see
+		// machine.mapPhysical).
+		atomic.StoreUintptr(&m.usedSlots[slot], physical)
 		// Successfully added region; we can increment nextSlot and
 		// allow another set to proceed here.
 		atomic.StoreUint32(&m.nextSlot, slot+1)
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index bf357de1a..eb05950cd 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
@@ -62,6 +62,9 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
 //
 //go:nosplit
 func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {
+	// Increment our counter.
+	atomic.AddUint64(&c.guestExits, 1)
+
 	// Copy out registers.
 	bluepillArchExit(c, bluepillArchContext(context))
 
@@ -89,9 +92,6 @@ func bluepillHandler(context unsafe.Pointer) {
 	// Sanitize the registers; interrupts must always be disabled.
 	c := bluepillArchEnter(bluepillArchContext(context))
 
-	// Increment the number of switches.
-	atomic.AddUint32(&c.switches, 1)
-
 	// Mark this as guest mode.
 	switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) {
 	case vCPUUser: // Expected case.
diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go
index 6e6b76416..17268d127 100644
--- a/pkg/sentry/platform/kvm/context.go
+++ b/pkg/sentry/platform/kvm/context.go
@@ -15,6 +15,8 @@
 package kvm
 
 import (
+	"sync/atomic"
+
 	pkgcontext "gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
@@ -75,6 +77,9 @@ func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac a
 	// Clear the address space.
 	cpu.active.set(nil)
 
+	// Increment the number of user exits.
+	atomic.AddUint64(&cpu.userExits, 1)
+
 	// Release resources.
 	c.machine.Put(cpu)
 
diff --git a/pkg/sentry/platform/kvm/filters_amd64.go b/pkg/sentry/platform/kvm/filters_amd64.go
index 7d949f1dd..d3d216aa5 100644
--- a/pkg/sentry/platform/kvm/filters_amd64.go
+++ b/pkg/sentry/platform/kvm/filters_amd64.go
@@ -17,14 +17,23 @@ package kvm
 import (
 	"syscall"
 
+	"golang.org/x/sys/unix"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/seccomp"
 )
 
 // SyscallFilters returns syscalls made exclusively by the KVM platform.
 func (*KVM) SyscallFilters() seccomp.SyscallRules {
 	return seccomp.SyscallRules{
-		syscall.SYS_ARCH_PRCTL:      {},
-		syscall.SYS_IOCTL:           {},
+		syscall.SYS_ARCH_PRCTL: {},
+		syscall.SYS_IOCTL:      {},
+		unix.SYS_MEMBARRIER: []seccomp.Rule{
+			{
+				seccomp.EqualTo(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED),
+				seccomp.EqualTo(0),
+			},
+		},
 		syscall.SYS_MMAP:            {},
 		syscall.SYS_RT_SIGSUSPEND:   {},
 		syscall.SYS_RT_SIGTIMEDWAIT: {},
diff --git a/pkg/sentry/platform/kvm/filters_arm64.go b/pkg/sentry/platform/kvm/filters_arm64.go
index 9245d07c2..21abc2a3d 100644
--- a/pkg/sentry/platform/kvm/filters_arm64.go
+++ b/pkg/sentry/platform/kvm/filters_arm64.go
@@ -17,13 +17,22 @@ package kvm
 import (
 	"syscall"
 
+	"golang.org/x/sys/unix"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/seccomp"
 )
 
 // SyscallFilters returns syscalls made exclusively by the KVM platform.
 func (*KVM) SyscallFilters() seccomp.SyscallRules {
 	return seccomp.SyscallRules{
-		syscall.SYS_IOCTL:           {},
+		syscall.SYS_IOCTL: {},
+		unix.SYS_MEMBARRIER: []seccomp.Rule{
+			{
+				seccomp.EqualTo(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED),
+				seccomp.EqualTo(0),
+			},
+		},
 		syscall.SYS_MMAP:            {},
 		syscall.SYS_RT_SIGSUSPEND:   {},
 		syscall.SYS_RT_SIGTIMEDWAIT: {},
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index ae813e24e..dd45ad10b 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -63,6 +63,9 @@ type runData struct {
 type KVM struct {
 	platform.NoCPUPreemptionDetection
 
+	// KVM never changes mm_structs.
+	platform.UseHostProcessMemoryBarrier
+
 	// machine is the backing VM.
 	machine *machine
 }
@@ -156,15 +159,7 @@ func (*KVM) MaxUserAddress() usermem.Addr {
 func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) {
 	// Allocate page tables and install system mappings.
 	pageTables := pagetables.New(newAllocator())
-	applyPhysicalRegions(func(pr physicalRegion) bool {
-		// Map the kernel in the upper half.
-		pageTables.Map(
-			usermem.Addr(ring0.KernelStartAddress|pr.virtual),
-			pr.length,
-			pagetables.MapOpts{AccessType: usermem.AnyAccess},
-			pr.physical)
-		return true // Keep iterating.
-	})
+	k.machine.mapUpperHalf(pageTables)
 
 	// Return the new address space.
 	return &addressSpace{
diff --git a/pkg/sentry/platform/kvm/kvm_arm64_test.go b/pkg/sentry/platform/kvm/kvm_arm64_test.go
new file mode 100644
index 000000000..0e3d84d95
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_arm64_test.go
@@ -0,0 +1,31 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
+)
+
+func TestKernelTLS(t *testing.T) {
+	bluepillTest(t, func(c *vCPU) {
+		if !testutil.TLSWorks() {
+			t.Errorf("tls does not work, and it should!")
+		}
+	})
+}
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 3bf918446..6abaa21c4 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -26,12 +26,16 @@ const (
 	_KVM_RUN                    = 0xae80
 	_KVM_NMI                    = 0xae9a
 	_KVM_CHECK_EXTENSION        = 0xae03
+	_KVM_GET_TSC_KHZ            = 0xaea3
+	_KVM_SET_TSC_KHZ            = 0xaea2
 	_KVM_INTERRUPT              = 0x4004ae86
 	_KVM_SET_MSRS               = 0x4008ae89
 	_KVM_SET_USER_MEMORY_REGION = 0x4020ae46
 	_KVM_SET_REGS               = 0x4090ae82
 	_KVM_SET_SREGS              = 0x4138ae84
+	_KVM_GET_MSRS               = 0xc008ae88
 	_KVM_GET_REGS               = 0x8090ae81
+	_KVM_GET_SREGS              = 0x8138ae83
 	_KVM_GET_SUPPORTED_CPUID    = 0xc008ae05
 	_KVM_SET_CPUID2             = 0x4008ae90
 	_KVM_SET_SIGNAL_MASK        = 0x4004ae8b
@@ -56,6 +60,7 @@ const (
 
 // KVM capability options.
 const (
+	_KVM_CAP_MAX_MEMSLOTS          = 0x0a
 	_KVM_CAP_MAX_VCPUS             = 0x42
 	_KVM_CAP_ARM_VM_IPA_SIZE       = 0xa5
 	_KVM_CAP_VCPU_EVENTS           = 0x29
@@ -64,6 +69,7 @@ const (
 
 // KVM limits.
 const (
+	_KVM_NR_MEMSLOTS      = 0x100
 	_KVM_NR_VCPUS         = 0xff
 	_KVM_NR_INTERRUPTS    = 0x100
 	_KVM_NR_CPUID_ENTRIES = 0x100
@@ -77,11 +83,14 @@ const (
 )
 
 // KVM hypercall list.
+//
 // Canonical list of hypercalls supported.
 const (
 	// On amd64, it uses 'HLT' to leave the guest.
+	//
 	// Unlike amd64, arm64 can only uses mmio_exit/psci to leave the guest.
-	// _KVM_HYPERCALL_VMEXIT is only used on Arm64 for now.
+	//
+	// _KVM_HYPERCALL_VMEXIT is only used on arm64 for now.
 	_KVM_HYPERCALL_VMEXIT int = iota
 	_KVM_HYPERCALL_MAX
 )
diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go
index 9a7be3655..84df0f878 100644
--- a/pkg/sentry/platform/kvm/kvm_const_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go
@@ -101,13 +101,20 @@ const (
 
 // Arm64: Memory Attribute Indirection Register EL1.
 const (
-	_MT_DEVICE_nGnRnE = 0
-	_MT_DEVICE_nGnRE  = 1
-	_MT_DEVICE_GRE    = 2
-	_MT_NORMAL_NC     = 3
-	_MT_NORMAL        = 4
-	_MT_NORMAL_WT     = 5
-	_MT_EL1_INIT      = (0 << _MT_DEVICE_nGnRnE) | (0x4 << _MT_DEVICE_nGnRE * 8) | (0xc << _MT_DEVICE_GRE * 8) | (0x44 << _MT_NORMAL_NC * 8) | (0xff << _MT_NORMAL * 8) | (0xbb << _MT_NORMAL_WT * 8)
+	_MT_DEVICE_nGnRnE      = 0
+	_MT_DEVICE_nGnRE       = 1
+	_MT_DEVICE_GRE         = 2
+	_MT_NORMAL_NC          = 3
+	_MT_NORMAL             = 4
+	_MT_NORMAL_WT          = 5
+	_MT_ATTR_DEVICE_nGnRnE = 0x00
+	_MT_ATTR_DEVICE_nGnRE  = 0x04
+	_MT_ATTR_DEVICE_GRE    = 0x0c
+	_MT_ATTR_NORMAL_NC     = 0x44
+	_MT_ATTR_NORMAL_WT     = 0xbb
+	_MT_ATTR_NORMAL        = 0xff
+	_MT_ATTR_MASK          = 0xff
+	_MT_EL1_INIT           = (_MT_ATTR_DEVICE_nGnRnE << (_MT_DEVICE_nGnRnE * 8)) | (_MT_ATTR_DEVICE_nGnRE << (_MT_DEVICE_nGnRE * 8)) | (_MT_ATTR_DEVICE_GRE << (_MT_DEVICE_GRE * 8)) | (_MT_ATTR_NORMAL_NC << (_MT_NORMAL_NC * 8)) | (_MT_ATTR_NORMAL << (_MT_NORMAL * 8)) | (_MT_ATTR_NORMAL_WT << (_MT_NORMAL_WT * 8))
 )
 
 const (
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index 45b3180f1..e58acc071 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -27,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+	ktime "gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -411,9 +412,9 @@ func TestWrongVCPU(t *testing.T) {
 			// Basic test, one then the other.
 			bluepill(c1)
 			bluepill(c2)
-			if c2.switches == 0 {
+			if c2.guestExits == 0 {
 				// Don't allow the test to proceed if this fails.
-				t.Fatalf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+				t.Fatalf("wrong vCPU#2 exits: vCPU1=%+v,vCPU2=%+v", c1, c2)
 			}
 
 			// Alternate vCPUs; we expect to need to trigger the
@@ -422,11 +423,11 @@ func TestWrongVCPU(t *testing.T) {
 				bluepill(c1)
 				bluepill(c2)
 			}
-			if count := c1.switches; count < 90 {
-				t.Errorf("wrong vCPU#1 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+			if count := c1.guestExits; count < 90 {
+				t.Errorf("wrong vCPU#1 exits: vCPU1=%+v,vCPU2=%+v", c1, c2)
 			}
-			if count := c2.switches; count < 90 {
-				t.Errorf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+			if count := c2.guestExits; count < 90 {
+				t.Errorf("wrong vCPU#2 exits: vCPU1=%+v,vCPU2=%+v", c1, c2)
 			}
 			return false
 		})
@@ -442,6 +443,22 @@ func TestWrongVCPU(t *testing.T) {
 	})
 }
 
+func TestRdtsc(t *testing.T) {
+	var i int // Iteration count.
+	kvmTest(t, nil, func(c *vCPU) bool {
+		start := ktime.Rdtsc()
+		bluepill(c)
+		guest := ktime.Rdtsc()
+		redpill()
+		end := ktime.Rdtsc()
+		if start > guest || guest > end {
+			t.Errorf("inconsistent time: start=%d, guest=%d, end=%d", start, guest, end)
+		}
+		i++
+		return i < 100
+	})
+}
+
 func BenchmarkApplicationSyscall(b *testing.B) {
 	var (
 		i int // Iteration includes machine.Get() / machine.Put().
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 6c54712d1..61ed24d01 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -43,9 +43,6 @@ type machine struct {
 	// kernel is the set of global structures.
 	kernel ring0.Kernel
 
-	// mappingCache is used for mapPhysical.
-	mappingCache sync.Map
-
 	// mu protects vCPUs.
 	mu sync.RWMutex
 
@@ -63,6 +60,12 @@ type machine struct {
 	// maxVCPUs is the maximum number of vCPUs supported by the machine.
 	maxVCPUs int
 
+	// maxSlots is the maximum number of memory slots supported by the machine.
+	maxSlots int
+
+	// usedSlots is the set of used physical addresses (sorted).
+	usedSlots []uintptr
+
 	// nextID is the next vCPU ID.
 	nextID uint32
 }
@@ -100,8 +103,11 @@ type vCPU struct {
 	// tid is the last set tid.
 	tid uint64
 
-	// switches is a count of world switches (informational only).
-	switches uint32
+	// userExits is the count of user exits.
+	userExits uint64
+
+	// guestExits is the count of guest to host world switches.
+	guestExits uint64
 
 	// faults is a count of world faults (informational only).
 	faults uint32
@@ -124,6 +130,7 @@ type vCPU struct {
 	// vCPUArchState is the architecture-specific state.
 	vCPUArchState
 
+	// dieState holds state related to vCPU death.
 	dieState dieState
 }
 
@@ -152,7 +159,7 @@ func (m *machine) newVCPU() *vCPU {
 		fd:      int(fd),
 		machine: m,
 	}
-	c.CPU.Init(&m.kernel, c)
+	c.CPU.Init(&m.kernel, c.id, c)
 	m.vCPUsByID[c.id] = c
 
 	// Ensure the signal mask is correct.
@@ -180,10 +187,8 @@ func newMachine(vm int) (*machine, error) {
 	// Create the machine.
 	m := &machine{fd: vm}
 	m.available.L = &m.mu
-	m.kernel.Init(ring0.KernelOpts{
-		PageTables: pagetables.New(newAllocator()),
-	})
 
+	// Pull the maximum vCPUs.
 	maxVCPUs, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
 	if errno != 0 {
 		m.maxVCPUs = _KVM_NR_VCPUS
@@ -191,10 +196,21 @@ func newMachine(vm int) (*machine, error) {
 		m.maxVCPUs = int(maxVCPUs)
 	}
 	log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
-
-	// Create the vCPUs map/slices.
 	m.vCPUsByTID = make(map[uint64]*vCPU)
 	m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
+	m.kernel.Init(ring0.KernelOpts{
+		PageTables: pagetables.New(newAllocator()),
+	}, m.maxVCPUs)
+
+	// Pull the maximum slots.
+	maxSlots, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
+	if errno != 0 {
+		m.maxSlots = _KVM_NR_MEMSLOTS
+	} else {
+		m.maxSlots = int(maxSlots)
+	}
+	log.Debugf("The maximum number of slots is %d.", m.maxSlots)
+	m.usedSlots = make([]uintptr, m.maxSlots)
 
 	// Apply the physical mappings. Note that these mappings may point to
 	// guest physical addresses that are not actually available. These
@@ -207,15 +223,9 @@ func newMachine(vm int) (*machine, error) {
 			pagetables.MapOpts{AccessType: usermem.AnyAccess},
 			pr.physical)
 
-		// And keep everything in the upper half.
-		m.kernel.PageTables.Map(
-			usermem.Addr(ring0.KernelStartAddress|pr.virtual),
-			pr.length,
-			pagetables.MapOpts{AccessType: usermem.AnyAccess},
-			pr.physical)
-
 		return true // Keep iterating.
 	})
+	m.mapUpperHalf(m.kernel.PageTables)
 
 	var physicalRegionsReadOnly []physicalRegion
 	var physicalRegionsAvailable []physicalRegion
@@ -272,6 +282,20 @@ func newMachine(vm int) (*machine, error) {
 	return m, nil
 }
 
+// hasSlot returns true iff the given address is mapped.
+//
+// This must be done via a linear scan.
+//
+//go:nosplit
+func (m *machine) hasSlot(physical uintptr) bool {
+	for i := 0; i < len(m.usedSlots); i++ {
+		if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical {
+			return true
+		}
+	}
+	return false
+}
+
 // mapPhysical checks for the mapping of a physical range, and installs one if
 // not available. This attempts to be efficient for calls in the hot path.
 //
@@ -286,8 +310,8 @@ func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalReg
 			panic("mapPhysical on unknown physical address")
 		}
 
-		if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok {
-			// Not present in the cache; requires setting the slot.
+		// Is this already mapped? Check the usedSlots.
+		if !m.hasSlot(physicalStart) {
 			if _, ok := handleBluepillFault(m, physical, phyRegions, flags); !ok {
 				panic("handleBluepillFault failed")
 			}
@@ -339,6 +363,11 @@ func (m *machine) Destroy() {
 // Get gets an available vCPU.
 //
 // This will return with the OS thread locked.
+//
+// It is guaranteed that if any OS thread TID is in guest, m.vCPUs[TID] points
+// to the vCPU in which the OS thread TID is running. So if Get() returns with
+// the corrent context in guest, the vCPU of it must be the same as what
+// Get() returns.
 func (m *machine) Get() *vCPU {
 	m.mu.RLock()
 	runtime.LockOSThread()
@@ -443,6 +472,19 @@ func (m *machine) newDirtySet() *dirtySet {
 	}
 }
 
+// dropPageTables drops cached page table entries.
+func (m *machine) dropPageTables(pt *pagetables.PageTables) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	// Clear from all PCIDs.
+	for _, c := range m.vCPUsByID {
+		if c != nil && c.PCIDs != nil {
+			c.PCIDs.Drop(pt)
+		}
+	}
+}
+
 // lock marks the vCPU as in user mode.
 //
 // This should only be called directly when known to be safe, i.e. when
@@ -502,6 +544,8 @@ var pid = syscall.Getpid()
 //
 // This effectively unwinds the state machine.
 func (c *vCPU) bounce(forceGuestExit bool) {
+	origGuestExits := atomic.LoadUint64(&c.guestExits)
+	origUserExits := atomic.LoadUint64(&c.userExits)
 	for {
 		switch state := atomic.LoadUint32(&c.state); state {
 		case vCPUReady, vCPUWaiter:
@@ -557,6 +601,14 @@ func (c *vCPU) bounce(forceGuestExit bool) {
 			// Should not happen: the above is exhaustive.
 			panic("invalid state")
 		}
+
+		// Check if we've missed the state transition, but
+		// we can safely return at this point in time.
+		newGuestExits := atomic.LoadUint64(&c.guestExits)
+		newUserExits := atomic.LoadUint64(&c.userExits)
+		if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) {
+			return
+		}
 	}
 }
 
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index acc823ba6..c67127d95 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -18,14 +18,17 @@ package kvm
 
 import (
 	"fmt"
+	"math/big"
 	"reflect"
 	"runtime/debug"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+	ktime "gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -84,19 +87,6 @@ const (
 	poolPCIDs = 8
 )
 
-// dropPageTables drops cached page table entries.
-func (m *machine) dropPageTables(pt *pagetables.PageTables) {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-
-	// Clear from all PCIDs.
-	for _, c := range m.vCPUsByID {
-		if c != nil && c.PCIDs != nil {
-			c.PCIDs.Drop(pt)
-		}
-	}
-}
-
 // initArchState initializes architecture-specific state.
 func (c *vCPU) initArchState() error {
 	var (
@@ -144,6 +134,7 @@ func (c *vCPU) initArchState() error {
 	// Set the entrypoint for the kernel.
 	kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer())
 	kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer())
+	kernelUserRegs.RSP = c.StackTop()
 	kernelUserRegs.RFLAGS = ring0.KernelFlagsSet
 
 	// Set the system registers.
@@ -152,8 +143,8 @@ func (c *vCPU) initArchState() error {
 	}
 
 	// Set the user registers.
-	if err := c.setUserRegisters(&kernelUserRegs); err != nil {
-		return err
+	if errno := c.setUserRegisters(&kernelUserRegs); errno != 0 {
+		return fmt.Errorf("error setting user registers: %v", errno)
 	}
 
 	// Allocate some floating point state save area for the local vCPU.
@@ -166,6 +157,133 @@ func (c *vCPU) initArchState() error {
 	return c.setSystemTime()
 }
 
+// bitsForScaling returns the bits available for storing the fraction component
+// of the TSC scaling ratio. This allows us to replicate the (bad) math done by
+// the kernel below in scaledTSC, and ensure we can compute an exact zero
+// offset in setSystemTime.
+//
+// These constants correspond to kvm_tsc_scaling_ratio_frac_bits.
+var bitsForScaling = func() int64 {
+	fs := cpuid.HostFeatureSet()
+	if fs.Intel() {
+		return 48 // See vmx.c (kvm sources).
+	} else if fs.AMD() {
+		return 32 // See svm.c (svm sources).
+	} else {
+		return 63 // Unknown: theoretical maximum.
+	}
+}()
+
+// scaledTSC returns the host TSC scaled by the given frequency.
+//
+// This assumes a current frequency of 1. We require only the unitless ratio of
+// rawFreq to some current frequency. See setSystemTime for context.
+//
+// The kernel math guarantees that all bits of the multiplication and division
+// will be correctly preserved and applied. However, it is not possible to
+// actually store the ratio correctly.  So we need to use the same schema in
+// order to calculate the scaled frequency and get the same result.
+//
+// We can assume that the current frequency is (1), so we are calculating a
+// strict inverse of this value. This simplifies this function considerably.
+//
+// Roughly, the returned value "scaledTSC" will have:
+// 	scaledTSC/hostTSC == 1/rawFreq
+//
+//go:nosplit
+func scaledTSC(rawFreq uintptr) int64 {
+	scale := int64(1 << bitsForScaling)
+	ratio := big.NewInt(scale / int64(rawFreq))
+	ratio.Mul(ratio, big.NewInt(int64(ktime.Rdtsc())))
+	ratio.Div(ratio, big.NewInt(scale))
+	return ratio.Int64()
+}
+
+// setSystemTime sets the vCPU to the system time.
+func (c *vCPU) setSystemTime() error {
+	// First, scale down the clock frequency to the lowest value allowed by
+	// the API itself.  How low we can go depends on the underlying
+	// hardware, but it is typically ~1/2^48 for Intel, ~1/2^32 for AMD.
+	// Even the lower bound here will take a 4GHz frequency down to 1Hz,
+	// meaning that everything should be able to handle a Khz setting of 1
+	// with bits to spare.
+	//
+	// Note that reducing the clock does not typically require special
+	// capabilities as it is emulated in KVM. We don't actually use this
+	// capability, but it means that this method should be robust to
+	// different hardware configurations.
+	rawFreq, err := c.getTSCFreq()
+	if err != nil {
+		return c.setSystemTimeLegacy()
+	}
+	if err := c.setTSCFreq(1); err != nil {
+		return c.setSystemTimeLegacy()
+	}
+
+	// Always restore the original frequency.
+	defer func() {
+		if err := c.setTSCFreq(rawFreq); err != nil {
+			panic(err.Error())
+		}
+	}()
+
+	// Attempt to set the system time in this compressed world. The
+	// calculation for offset normally looks like:
+	//
+	//	offset = target_tsc - kvm_scale_tsc(vcpu, rdtsc());
+	//
+	// So as long as the kvm_scale_tsc component is constant before and
+	// after the call to set the TSC value (and it is passes as the
+	// target_tsc), we will compute an offset value of zero.
+	//
+	// This is effectively cheating to make our "setSystemTime" call so
+	// unbelievably, incredibly fast that we do it "instantly" and all the
+	// calculations result in an offset of zero.
+	lastTSC := scaledTSC(rawFreq)
+	for {
+		if err := c.setTSC(uint64(lastTSC)); err != nil {
+			return err
+		}
+		nextTSC := scaledTSC(rawFreq)
+		if lastTSC == nextTSC {
+			return nil
+		}
+		lastTSC = nextTSC // Try again.
+	}
+}
+
+// setSystemTimeLegacy calibrates and sets an approximate system time.
+func (c *vCPU) setSystemTimeLegacy() error {
+	const minIterations = 10
+	minimum := uint64(0)
+	for iter := 0; ; iter++ {
+		// Try to set the TSC to an estimate of where it will be
+		// on the host during a "fast" system call iteration.
+		start := uint64(ktime.Rdtsc())
+		if err := c.setTSC(start + (minimum / 2)); err != nil {
+			return err
+		}
+		// See if this is our new minimum call time. Note that this
+		// serves two functions: one, we make sure that we are
+		// accurately predicting the offset we need to set. Second, we
+		// don't want to do the final set on a slow call, which could
+		// produce a really bad result.
+		end := uint64(ktime.Rdtsc())
+		if end < start {
+			continue // Totally bogus: unstable TSC?
+		}
+		current := end - start
+		if current < minimum || iter == 0 {
+			minimum = current // Set our new minimum.
+		}
+		// Is this past minIterations and within ~10% of minimum?
+		upperThreshold := (((minimum << 3) + minimum) >> 3)
+		if iter >= minIterations && current <= upperThreshold {
+			return nil
+		}
+	}
+}
+
 // nonCanonical generates a canonical address return.
 //
 //go:nosplit
@@ -345,3 +463,41 @@ func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
 func availableRegionsForSetMem() (phyRegions []physicalRegion) {
 	return physicalRegions
 }
+
+var execRegions = func() (regions []region) {
+	applyVirtualRegions(func(vr virtualRegion) {
+		if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" {
+			return
+		}
+		if vr.accessType.Execute {
+			regions = append(regions, vr.region)
+		}
+	})
+	return
+}()
+
+func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
+	for _, r := range execRegions {
+		physical, length, ok := translateToPhysical(r.virtual)
+		if !ok || length < r.length {
+			panic("impossilbe translation")
+		}
+		pageTable.Map(
+			usermem.Addr(ring0.KernelStartAddress|r.virtual),
+			r.length,
+			pagetables.MapOpts{AccessType: usermem.Execute},
+			physical)
+	}
+	for start, end := range m.kernel.EntryRegions() {
+		regionLen := end - start
+		physical, length, ok := translateToPhysical(start)
+		if !ok || length < regionLen {
+			panic("impossible translation")
+		}
+		pageTable.Map(
+			usermem.Addr(ring0.KernelStartAddress|start),
+			regionLen,
+			pagetables.MapOpts{AccessType: usermem.ReadWrite},
+			physical)
+	}
+}
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 290f035dd..b430f92c6 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -23,7 +23,6 @@ import (
 	"unsafe"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/time"
 )
 
 // loadSegments copies the current segments.
@@ -61,91 +60,63 @@ func (c *vCPU) setCPUID() error {
 	return nil
 }
 
-// setSystemTime sets the TSC for the vCPU.
+// getTSCFreq gets the TSC frequency.
 //
-// This has to make the call many times in order to minimize the intrinsic
-// error in the offset. Unfortunately KVM does not expose a relative offset via
-// the API, so this is an approximation. We do this via an iterative algorithm.
-// This has the advantage that it can generally deal with highly variable
-// system call times and should converge on the correct offset.
-func (c *vCPU) setSystemTime() error {
-	const (
-		_MSR_IA32_TSC  = 0x00000010
-		calibrateTries = 10
-	)
-	registers := modelControlRegisters{
-		nmsrs: 1,
-	}
-	registers.entries[0] = modelControlRegister{
-		index: _MSR_IA32_TSC,
+// If mustSucceed is true, then this function panics on error.
+func (c *vCPU) getTSCFreq() (uintptr, error) {
+	rawFreq, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_GET_TSC_KHZ,
+		0 /* ignored */)
+	if errno != 0 {
+		return 0, errno
 	}
-	target := uint64(^uint32(0))
-	for done := 0; done < calibrateTries; {
-		start := uint64(time.Rdtsc())
-		registers.entries[0].data = start + target
-		if _, _, errno := syscall.RawSyscall(
-			syscall.SYS_IOCTL,
-			uintptr(c.fd),
-			_KVM_SET_MSRS,
-			uintptr(unsafe.Pointer(&registers))); errno != 0 {
-			return fmt.Errorf("error setting system time: %v", errno)
-		}
-		// See if this is our new minimum call time. Note that this
-		// serves two functions: one, we make sure that we are
-		// accurately predicting the offset we need to set. Second, we
-		// don't want to do the final set on a slow call, which could
-		// produce a really bad result. So we only count attempts
-		// within +/- 6.25% of our minimum as an attempt.
-		end := uint64(time.Rdtsc())
-		if end < start {
-			continue // Totally bogus.
-		}
-		half := (end - start) / 2
-		if half < target {
-			target = half
-		}
-		if (half - target) < target/8 {
-			done++
-		}
+	return rawFreq, nil
+}
+
+// setTSCFreq sets the TSC frequency.
+func (c *vCPU) setTSCFreq(freq uintptr) error {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_TSC_KHZ,
+		freq /* khz */); errno != 0 {
+		return fmt.Errorf("error setting TSC frequency: %v", errno)
 	}
 	return nil
 }
 
-// setSignalMask sets the vCPU signal mask.
-//
-// This must be called prior to running the vCPU.
-func (c *vCPU) setSignalMask() error {
-	// The layout of this structure implies that it will not necessarily be
-	// the same layout chosen by the Go compiler. It gets fudged here.
-	var data struct {
-		length uint32
-		mask1  uint32
-		mask2  uint32
-		_      uint32
+// setTSC sets the TSC value.
+func (c *vCPU) setTSC(value uint64) error {
+	const _MSR_IA32_TSC = 0x00000010
+	registers := modelControlRegisters{
+		nmsrs: 1,
 	}
-	data.length = 8 // Fixed sigset size.
-	data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
-	data.mask2 = ^uint32(bounceSignalMask >> 32)
+	registers.entries[0].index = _MSR_IA32_TSC
+	registers.entries[0].data = value
 	if _, _, errno := syscall.RawSyscall(
 		syscall.SYS_IOCTL,
 		uintptr(c.fd),
-		_KVM_SET_SIGNAL_MASK,
-		uintptr(unsafe.Pointer(&data))); errno != 0 {
-		return fmt.Errorf("error setting signal mask: %v", errno)
+		_KVM_SET_MSRS,
+		uintptr(unsafe.Pointer(&registers))); errno != 0 {
+		return fmt.Errorf("error setting tsc: %v", errno)
 	}
 	return nil
 }
 
 // setUserRegisters sets user registers in the vCPU.
-func (c *vCPU) setUserRegisters(uregs *userRegs) error {
+//
+//go:nosplit
+func (c *vCPU) setUserRegisters(uregs *userRegs) syscall.Errno {
 	if _, _, errno := syscall.RawSyscall(
 		syscall.SYS_IOCTL,
 		uintptr(c.fd),
 		_KVM_SET_REGS,
 		uintptr(unsafe.Pointer(uregs))); errno != 0 {
-		return fmt.Errorf("error setting user registers: %v", errno)
+		return errno
 	}
-	return nil
+	return 0
 }
 
 // getUserRegisters reloads user registers in the vCPU.
@@ -175,3 +146,17 @@ func (c *vCPU) setSystemRegisters(sregs *systemRegs) error {
 	}
 	return nil
 }
+
+// getSystemRegisters sets system registers.
+//
+//go:nosplit
+func (c *vCPU) getSystemRegisters(sregs *systemRegs) syscall.Errno {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_GET_SREGS,
+		uintptr(unsafe.Pointer(sregs))); errno != 0 {
+		return errno
+	}
+	return 0
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index 9db171af9..54837f20c 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -19,6 +19,7 @@ package kvm
 import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -48,6 +49,18 @@ const (
 	poolPCIDs = 8
 )
 
+func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
+	applyPhysicalRegions(func(pr physicalRegion) bool {
+		pageTable.Map(
+			usermem.Addr(ring0.KernelStartAddress|pr.virtual),
+			pr.length,
+			pagetables.MapOpts{AccessType: usermem.AnyAccess},
+			pr.physical)
+
+		return true // Keep iterating.
+	})
+}
+
 // Get all read-only physicalRegions.
 func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
 	var rdonlyRegions []region
@@ -100,19 +113,6 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) {
 	return phyRegions
 }
 
-// dropPageTables drops cached page table entries.
-func (m *machine) dropPageTables(pt *pagetables.PageTables) {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-
-	// Clear from all PCIDs.
-	for _, c := range m.vCPUsByID {
-		if c.PCIDs != nil {
-			c.PCIDs.Drop(pt)
-		}
-	}
-}
-
 // nonCanonical generates a canonical address return.
 //
 //go:nosplit
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 905712076..84992c06d 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -79,7 +79,7 @@ func (c *vCPU) initArchState() error {
 	}
 
 	// tcr_el1
-	data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS
+	data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS | _TCR_A1
 	reg.id = _KVM_ARM64_REGS_TCR_EL1
 	if err := c.setOneRegister(&reg); err != nil {
 		return err
@@ -103,7 +103,7 @@ func (c *vCPU) initArchState() error {
 	c.SetTtbr0Kvm(uintptr(data))
 
 	// ttbr1_el1
-	data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0)
+	data = c.machine.kernel.PageTables.TTBR1_EL1(false, 1)
 
 	reg.id = _KVM_ARM64_REGS_TTBR1_EL1
 	if err := c.setOneRegister(&reg); err != nil {
@@ -191,42 +191,6 @@ func (c *vCPU) getOneRegister(reg *kvmOneReg) error {
 	return nil
 }
 
-// setCPUID sets the CPUID to be used by the guest.
-func (c *vCPU) setCPUID() error {
-	return nil
-}
-
-// setSystemTime sets the TSC for the vCPU.
-func (c *vCPU) setSystemTime() error {
-	return nil
-}
-
-// setSignalMask sets the vCPU signal mask.
-//
-// This must be called prior to running the vCPU.
-func (c *vCPU) setSignalMask() error {
-	// The layout of this structure implies that it will not necessarily be
-	// the same layout chosen by the Go compiler. It gets fudged here.
-	var data struct {
-		length uint32
-		mask1  uint32
-		mask2  uint32
-		_      uint32
-	}
-	data.length = 8 // Fixed sigset size.
-	data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
-	data.mask2 = ^uint32(bounceSignalMask >> 32)
-	if _, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(c.fd),
-		_KVM_SET_SIGNAL_MASK,
-		uintptr(unsafe.Pointer(&data))); errno != 0 {
-		return fmt.Errorf("error setting signal mask: %v", errno)
-	}
-
-	return nil
-}
-
 // SwitchToUser unpacks architectural-details.
 func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) {
 	// Check for canonical addresses.
@@ -271,8 +235,9 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
 		return c.fault(int32(syscall.SIGSEGV), info)
 	case ring0.Vector(bounce): // ring0.VirtualizationException
 		return usermem.NoAccess, platform.ErrContextInterrupt
-	case ring0.El0Sync_undef,
-		ring0.El1Sync_undef:
+	case ring0.El0Sync_undef:
+		return c.fault(int32(syscall.SIGILL), info)
+	case ring0.El1Sync_undef:
 		*info = arch.SignalInfo{
 			Signo: int32(syscall.SIGILL),
 			Code:  1, // ILL_ILLOPC (illegal opcode).
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index 9f86f6a7a..1d6ca245a 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
@@ -143,3 +143,29 @@ func (c *vCPU) waitUntilNot(state uint32) {
 		panic("futex wait error")
 	}
 }
+
+// setSignalMask sets the vCPU signal mask.
+//
+// This must be called prior to running the vCPU.
+func (c *vCPU) setSignalMask() error {
+	// The layout of this structure implies that it will not necessarily be
+	// the same layout chosen by the Go compiler. It gets fudged here.
+	var data struct {
+		length uint32
+		mask1  uint32
+		mask2  uint32
+		_      uint32
+	}
+	data.length = 8 // Fixed sigset size.
+	data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
+	data.mask2 = ^uint32(bounceSignalMask >> 32)
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_SIGNAL_MASK,
+		uintptr(unsafe.Pointer(&data))); errno != 0 {
+		return fmt.Errorf("error setting signal mask: %v", errno)
+	}
+
+	return nil
+}
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
index 4dad877ba..c5235ca9d 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
@@ -23,6 +23,11 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
+// TLSWorks is a tls test.
+//
+// It returns true or false.
+func TLSWorks() bool
+
 // SetTestTarget sets the rip appropriately.
 func SetTestTarget(regs *arch.Registers, fn func()) {
 	regs.Pc = uint64(reflect.ValueOf(fn).Pointer())
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
index 6caf7282d..7348c29a5 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
@@ -50,6 +50,22 @@ TEXT ·SpinLoop(SB),NOSPLIT,$0
 start:
 	B start
 
+TEXT ·TLSWorks(SB),NOSPLIT,$0-8
+        NO_LOCAL_POINTERS
+        MOVD $0x6789, R5
+        MSR R5, TPIDR_EL0
+        MOVD $SYS_GETPID, R8 // getpid
+        SVC
+        MRS TPIDR_EL0, R6
+        CMP R5, R6
+        BNE isNaN
+        MOVD $1, R0
+        MOVD R0, ret+0(FP)
+        RET
+isNaN:
+        MOVD $0, ret+0(FP)
+        RET
+
 TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8
 	NO_LOCAL_POINTERS
 	// gc will touch fpsimd, so we should test it.
diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go
index c8897d34f..4dcdbf8a7 100644
--- a/pkg/sentry/platform/kvm/virtual_map.go
+++ b/pkg/sentry/platform/kvm/virtual_map.go
@@ -34,7 +34,7 @@ type virtualRegion struct {
 }
 
 // mapsLine matches a single line from /proc/PID/maps.
-var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2}:[0-9a-f]{2,} [0-9]+\\s+(.*)")
+var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2,3}:[0-9a-f]{2,} [0-9]+\\s+(.*)")
 
 // excludeRegion returns true if these regions should be excluded from the
 // physical map. Virtual regions need to be excluded if get_user_pages will
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go
index ba031516a..dcfe839a7 100644
--- a/pkg/sentry/platform/platform.go
+++ b/pkg/sentry/platform/platform.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/seccomp"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/hostmm"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -52,6 +53,10 @@ type Platform interface {
 	// can reliably return ErrContextCPUPreempted.
 	DetectsCPUPreemption() bool
 
+	// HaveGlobalMemoryBarrier returns true if the GlobalMemoryBarrier method
+	// is supported.
+	HaveGlobalMemoryBarrier() bool
+
 	// MapUnit returns the alignment used for optional mappings into this
 	// platform's AddressSpaces. Higher values indicate lower per-page costs
 	// for AddressSpace.MapFile. As a special case, a MapUnit of 0 indicates
@@ -97,6 +102,15 @@ type Platform interface {
 	// called.
 	PreemptAllCPUs() error
 
+	// GlobalMemoryBarrier blocks until all threads running application code
+	// (via Context.Switch) and all task goroutines "have passed through a
+	// state where all memory accesses to user-space addresses match program
+	// order between entry to and return from [GlobalMemoryBarrier]", as for
+	// membarrier(2).
+	//
+	// Preconditions: HaveGlobalMemoryBarrier() == true.
+	GlobalMemoryBarrier() error
+
 	// SyscallFilters returns syscalls made exclusively by this platform.
 	SyscallFilters() seccomp.SyscallRules
 }
@@ -115,6 +129,43 @@ func (NoCPUPreemptionDetection) PreemptAllCPUs() error {
 	panic("This platform does not support CPU preemption detection")
 }
 
+// UseHostGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and
+// Platform.GlobalMemoryBarrier by invoking equivalent functionality on the
+// host.
+type UseHostGlobalMemoryBarrier struct{}
+
+// HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier.
+func (UseHostGlobalMemoryBarrier) HaveGlobalMemoryBarrier() bool {
+	return hostmm.HaveGlobalMemoryBarrier()
+}
+
+// GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier.
+func (UseHostGlobalMemoryBarrier) GlobalMemoryBarrier() error {
+	return hostmm.GlobalMemoryBarrier()
+}
+
+// UseHostProcessMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and
+// Platform.GlobalMemoryBarrier by invoking a process-local memory barrier.
+// This is faster than UseHostGlobalMemoryBarrier, but is only appropriate for
+// platforms for which application code executes while using the sentry's
+// mm_struct.
+type UseHostProcessMemoryBarrier struct{}
+
+// HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier.
+func (UseHostProcessMemoryBarrier) HaveGlobalMemoryBarrier() bool {
+	// Fall back to a global memory barrier if a process-local one isn't
+	// available.
+	return hostmm.HaveProcessMemoryBarrier() || hostmm.HaveGlobalMemoryBarrier()
+}
+
+// GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier.
+func (UseHostProcessMemoryBarrier) GlobalMemoryBarrier() error {
+	if hostmm.HaveProcessMemoryBarrier() {
+		return hostmm.ProcessMemoryBarrier()
+	}
+	return hostmm.GlobalMemoryBarrier()
+}
+
 // MemoryManager represents an abstraction above the platform address space
 // which manages memory mappings and their contents.
 type MemoryManager interface {
@@ -245,14 +296,19 @@ type AddressSpace interface {
 	// physical memory) to the mapping. The precommit flag is advisory and
 	// implementations may choose to ignore it.
 	//
-	// Preconditions: addr and fr must be page-aligned. fr.Length() > 0.
-	// at.Any() == true. At least one reference must be held on all pages in
-	// fr, and must continue to be held as long as pages are mapped.
+	// Preconditions:
+	// * addr and fr must be page-aligned.
+	// * fr.Length() > 0.
+	// * at.Any() == true.
+	// * At least one reference must be held on all pages in fr, and must
+	//   continue to be held as long as pages are mapped.
 	MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error
 
 	// Unmap unmaps the given range.
 	//
-	// Preconditions: addr is page-aligned. length > 0.
+	// Preconditions:
+	// * addr is page-aligned.
+	// * length > 0.
 	Unmap(addr usermem.Addr, length uint64)
 
 	// Release releases this address space. After releasing, a new AddressSpace
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index e04165fbf..fc43cc3c0 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -30,7 +30,6 @@ go_library(
         "//pkg/safecopy",
         "//pkg/seccomp",
         "//pkg/sentry/arch",
-        "//pkg/sentry/hostcpu",
         "//pkg/sentry/memmap",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/interrupt",
diff --git a/pkg/sentry/platform/ptrace/filters.go b/pkg/sentry/platform/ptrace/filters.go
index 1e07cfd0d..b0970e356 100644
--- a/pkg/sentry/platform/ptrace/filters.go
+++ b/pkg/sentry/platform/ptrace/filters.go
@@ -24,10 +24,9 @@ import (
 // SyscallFilters returns syscalls made exclusively by the ptrace platform.
 func (*PTrace) SyscallFilters() seccomp.SyscallRules {
 	return seccomp.SyscallRules{
-		unix.SYS_GETCPU:            {},
-		unix.SYS_SCHED_SETAFFINITY: {},
-		syscall.SYS_PTRACE:         {},
-		syscall.SYS_TGKILL:         {},
-		syscall.SYS_WAIT4:          {},
+		unix.SYS_GETCPU:    {},
+		syscall.SYS_PTRACE: {},
+		syscall.SYS_TGKILL: {},
+		syscall.SYS_WAIT4:  {},
 	}
 }
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
index b52d0fbd8..f56aa3b79 100644
--- a/pkg/sentry/platform/ptrace/ptrace.go
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -192,6 +192,7 @@ func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {}
 type PTrace struct {
 	platform.MMapMinAddr
 	platform.NoCPUPreemptionDetection
+	platform.UseHostGlobalMemoryBarrier
 }
 
 // New returns a new ptrace-based implementation of the platform interface.
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index e1d54d8a2..812ab80ef 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -518,11 +518,6 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 	}
 	defer c.interrupt.Disable()
 
-	// Ensure that the CPU set is bound appropriately; this makes the
-	// emulation below several times faster, presumably by avoiding
-	// interprocessor wakeups and by simplifying the schedule.
-	t.bind()
-
 	// Set registers.
 	if err := t.setRegs(regs); err != nil {
 		panic(fmt.Sprintf("ptrace set regs (%+v) failed: %v", regs, err))
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 84b699f0d..020bbda79 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -201,7 +201,7 @@ func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFActi
 			seccomp.RuleSet{
 				Rules: seccomp.SyscallRules{
 					syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
-						{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+						{seccomp.EqualTo(linux.ARCH_SET_CPUID), seccomp.EqualTo(0)},
 					},
 				},
 				Action: linux.SECCOMP_RET_ALLOW,
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 2ce528601..8548853da 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -80,9 +80,9 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 			Rules: seccomp.SyscallRules{
 				syscall.SYS_CLONE: []seccomp.Rule{
 					// Allow creation of new subprocesses (used by the master).
-					{seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)},
+					{seccomp.EqualTo(syscall.CLONE_FILES | syscall.SIGKILL)},
 					// Allow creation of new threads within a single address space (used by addresss spaces).
-					{seccomp.AllowValue(
+					{seccomp.EqualTo(
 						syscall.CLONE_FILES |
 							syscall.CLONE_FS |
 							syscall.CLONE_SIGHAND |
@@ -97,14 +97,14 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 
 				// For the stub prctl dance (all).
 				syscall.SYS_PRCTL: []seccomp.Rule{
-					{seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)},
+					{seccomp.EqualTo(syscall.PR_SET_PDEATHSIG), seccomp.EqualTo(syscall.SIGKILL)},
 				},
 				syscall.SYS_GETPPID: {},
 
 				// For the stub to stop itself (all).
 				syscall.SYS_GETPID: {},
 				syscall.SYS_KILL: []seccomp.Rule{
-					{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)},
+					{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SIGSTOP)},
 				},
 
 				// Injected to support the address space operations.
@@ -115,7 +115,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 		})
 	}
 	rules = appendArchSeccompRules(rules, defaultAction)
-	instrs, err := seccomp.BuildProgram(rules, defaultAction)
+	instrs, err := seccomp.BuildProgram(rules, defaultAction, defaultAction)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
index 245b20722..533e45497 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
@@ -18,29 +18,12 @@
 package ptrace
 
 import (
-	"sync/atomic"
 	"syscall"
 	"unsafe"
 
-	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
-	"gvisor.dev/gvisor/pkg/sync"
 )
 
-// maskPool contains reusable CPU masks for setting affinity. Unfortunately,
-// runtime.NumCPU doesn't actually record the number of CPUs on the system, it
-// just records the number of CPUs available in the scheduler affinity set at
-// startup. This may a) change over time and b) gives a number far lower than
-// the maximum indexable CPU. To prevent lots of allocation in the hot path, we
-// use a pool to store large masks that we can reuse during bind.
-var maskPool = sync.Pool{
-	New: func() interface{} {
-		const maxCPUs = 1024 // Not a hard limit; see below.
-		return make([]uintptr, maxCPUs/64)
-	},
-}
-
 // unmaskAllSignals unmasks all signals on the current thread.
 //
 //go:nosplit
@@ -49,47 +32,3 @@ func unmaskAllSignals() syscall.Errno {
 	_, _, errno := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_SETMASK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0)
 	return errno
 }
-
-// setCPU sets the CPU affinity.
-func (t *thread) setCPU(cpu uint32) error {
-	mask := maskPool.Get().([]uintptr)
-	n := int(cpu / 64)
-	v := uintptr(1 << uintptr(cpu%64))
-	if n >= len(mask) {
-		// See maskPool note above. We've actually exceeded the number
-		// of available cores. Grow the mask and return it.
-		mask = make([]uintptr, n+1)
-	}
-	mask[n] |= v
-	if _, _, errno := syscall.RawSyscall(
-		unix.SYS_SCHED_SETAFFINITY,
-		uintptr(t.tid),
-		uintptr(len(mask)*8),
-		uintptr(unsafe.Pointer(&mask[0]))); errno != 0 {
-		return errno
-	}
-	mask[n] &^= v
-	maskPool.Put(mask)
-	return nil
-}
-
-// bind attempts to ensure that the thread is on the same CPU as the current
-// thread. This provides no guarantees as it is fundamentally a racy operation:
-// CPU sets may change and we may be rescheduled in the middle of this
-// operation. As a result, no failures are reported.
-//
-// Precondition: the current runtime thread should be locked.
-func (t *thread) bind() {
-	currentCPU := hostcpu.GetCPU()
-
-	if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU {
-		// Set the affinity on the thread and save the CPU for next
-		// round; we don't expect CPUs to bounce around too frequently.
-		//
-		// (It's worth noting that we could move CPUs between this point
-		// and when the tracee finishes executing. But that would be
-		// roughly the status quo anyways -- we're just maximizing our
-		// chances of colocation, not guaranteeing it.)
-		t.setCPU(currentCPU)
-	}
-}
diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
index 0bee995e4..7ee20d89a 100644
--- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go
index 9c6c2cf5c..00899273e 100644
--- a/pkg/sentry/platform/ring0/defs_amd64.go
+++ b/pkg/sentry/platform/ring0/defs_amd64.go
@@ -76,15 +76,41 @@ type KernelOpts struct {
 type KernelArchState struct {
 	KernelOpts
 
+	// cpuEntries is array of kernelEntry for all cpus
+	cpuEntries []kernelEntry
+
 	// globalIDT is our set of interrupt gates.
-	globalIDT idt64
+	globalIDT *idt64
 }
 
-// CPUArchState contains CPU-specific arch state.
-type CPUArchState struct {
+// kernelEntry contains minimal CPU-specific arch state
+// that can be mapped at the upper of the address space.
+// Malicious APP might steal info from it via CPU bugs.
+type kernelEntry struct {
 	// stack is the stack used for interrupts on this CPU.
 	stack [256]byte
 
+	// scratch space for temporary usage.
+	scratch0 uint64
+
+	// stackTop is the top of the stack.
+	stackTop uint64
+
+	// cpuSelf is back reference to CPU.
+	cpuSelf *CPU
+
+	// kernelCR3 is the cr3 used for sentry kernel.
+	kernelCR3 uintptr
+
+	// gdt is the CPU's descriptor table.
+	gdt descriptorTable
+
+	// tss is the CPU's task state.
+	tss TaskState64
+}
+
+// CPUArchState contains CPU-specific arch state.
+type CPUArchState struct {
 	// errorCode is the error code from the last exception.
 	errorCode uintptr
 
@@ -97,11 +123,7 @@ type CPUArchState struct {
 	// exception.
 	errorType uintptr
 
-	// gdt is the CPU's descriptor table.
-	gdt descriptorTable
-
-	// tss is the CPU's task state.
-	tss TaskState64
+	*kernelEntry
 }
 
 // ErrorCode returns the last error code.
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go
index 0e2ab716c..508236e46 100644
--- a/pkg/sentry/platform/ring0/defs_arm64.go
+++ b/pkg/sentry/platform/ring0/defs_arm64.go
@@ -77,6 +77,9 @@ type CPUArchState struct {
 
 	// lazyVFP is the value of cpacr_el1.
 	lazyVFP uintptr
+
+	// appASID is the asid value of guest application.
+	appASID uintptr
 }
 
 // ErrorCode returns the last error code.
diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go
index 7fa43c2f5..d87b1fd00 100644
--- a/pkg/sentry/platform/ring0/entry_amd64.go
+++ b/pkg/sentry/platform/ring0/entry_amd64.go
@@ -36,12 +36,15 @@ func sysenter()
 // This must be called prior to sysret/iret.
 func swapgs()
 
+// jumpToKernel jumps to the kernel version of the current RIP.
+func jumpToKernel()
+
 // sysret returns to userspace from a system call.
 //
 // The return code is the vector that interrupted execution.
 //
 // See stubs.go for a note regarding the frame size of this function.
-func sysret(*CPU, *arch.Registers) Vector
+func sysret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector
 
 // "iret is the cadillac of CPL switching."
 //
@@ -50,7 +53,7 @@ func sysret(*CPU, *arch.Registers) Vector
 // iret is nearly identical to sysret, except an iret is used to fully restore
 // all user state. This must be called in cases where all registers need to be
 // restored.
-func iret(*CPU, *arch.Registers) Vector
+func iret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector
 
 // exception is the generic exception entry.
 //
diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s
index 02df38331..f59747df3 100644
--- a/pkg/sentry/platform/ring0/entry_amd64.s
+++ b/pkg/sentry/platform/ring0/entry_amd64.s
@@ -63,6 +63,15 @@
   MOVQ offset+PTRACE_RSI(reg), SI; \
   MOVQ offset+PTRACE_RDI(reg), DI;
 
+// WRITE_CR3() writes the given CR3 value.
+//
+// The code corresponds to:
+//
+//     mov %rax, %cr3
+//
+#define WRITE_CR3() \
+	BYTE $0x0f; BYTE $0x22; BYTE $0xd8;
+
 // SWAP_GS swaps the kernel GS (CPU).
 #define SWAP_GS() \
 	BYTE $0x0F; BYTE $0x01; BYTE $0xf8;
@@ -75,15 +84,9 @@
 #define SYSRET64() \
 	BYTE $0x48; BYTE $0x0f; BYTE $0x07;
 
-// LOAD_KERNEL_ADDRESS loads a kernel address.
-#define LOAD_KERNEL_ADDRESS(from, to) \
-	MOVQ from, to; \
-	ORQ ·KernelStartAddress(SB), to;
-
 // LOAD_KERNEL_STACK loads the kernel stack.
-#define LOAD_KERNEL_STACK(from) \
-	LOAD_KERNEL_ADDRESS(CPU_SELF(from), SP); \
-	LEAQ CPU_STACK_TOP(SP), SP;
+#define LOAD_KERNEL_STACK(entry) \
+	MOVQ ENTRY_STACK_TOP(entry), SP;
 
 // See kernel.go.
 TEXT ·Halt(SB),NOSPLIT,$0
@@ -95,58 +98,93 @@ TEXT ·swapgs(SB),NOSPLIT,$0
 	SWAP_GS()
 	RET
 
+// jumpToKernel changes execution to the kernel address space.
+//
+// This works by changing the return value to the kernel version.
+TEXT ·jumpToKernel(SB),NOSPLIT,$0
+	MOVQ 0(SP), AX
+	ORQ ·KernelStartAddress(SB), AX // Future return value.
+	MOVQ AX, 0(SP)
+	RET
+
 // See entry_amd64.go.
 TEXT ·sysret(SB),NOSPLIT,$0-24
-	// Save original state.
-	LOAD_KERNEL_ADDRESS(cpu+0(FP), BX)
-	LOAD_KERNEL_ADDRESS(regs+8(FP), AX)
+	CALL ·jumpToKernel(SB)
+	// Save original state and stack. sysenter() or exception()
+	// from APP(gr3) will switch to this stack, set the return
+	// value (vector: 32(SP)) and then do RET, which will also
+	// automatically return to the lower half.
+	MOVQ cpu+0(FP), BX
+	MOVQ regs+8(FP), AX
+	MOVQ userCR3+16(FP), CX
 	MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
 	MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
 	MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
 
+	// save SP AX userCR3 on the kernel stack.
+	MOVQ CPU_ENTRY(BX), BX
+	LOAD_KERNEL_STACK(BX)
+	PUSHQ PTRACE_RSP(AX)
+	PUSHQ PTRACE_RAX(AX)
+	PUSHQ CX
+
 	// Restore user register state.
 	REGISTERS_LOAD(AX, 0)
 	MOVQ PTRACE_RIP(AX), CX    // Needed for SYSRET.
 	MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET.
-	MOVQ PTRACE_RSP(AX), SP    // Restore the stack directly.
-	MOVQ PTRACE_RAX(AX), AX    // Restore AX (scratch).
+
+	// restore userCR3, AX, SP.
+	POPQ AX	                            // Get userCR3.
+	WRITE_CR3()                         // Switch to userCR3.
+	POPQ AX                             // Restore AX.
+	POPQ SP                             // Restore SP.
 	SYSRET64()
 
 // See entry_amd64.go.
 TEXT ·iret(SB),NOSPLIT,$0-24
-	// Save original state.
-	LOAD_KERNEL_ADDRESS(cpu+0(FP), BX)
-	LOAD_KERNEL_ADDRESS(regs+8(FP), AX)
+	CALL ·jumpToKernel(SB)
+	// Save original state and stack. sysenter() or exception()
+	// from APP(gr3) will switch to this stack, set the return
+	// value (vector: 32(SP)) and then do RET, which will also
+	// automatically return to the lower half.
+	MOVQ cpu+0(FP), BX
+	MOVQ regs+8(FP), AX
+	MOVQ userCR3+16(FP), CX
 	MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
 	MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
 	MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
 
 	// Build an IRET frame & restore state.
+	MOVQ CPU_ENTRY(BX), BX
 	LOAD_KERNEL_STACK(BX)
-	MOVQ PTRACE_SS(AX), BX;    PUSHQ BX
-	MOVQ PTRACE_RSP(AX), CX;   PUSHQ CX
-	MOVQ PTRACE_FLAGS(AX), DX; PUSHQ DX
-	MOVQ PTRACE_CS(AX), DI;    PUSHQ DI
-	MOVQ PTRACE_RIP(AX), SI;   PUSHQ SI
-	REGISTERS_LOAD(AX, 0)   // Restore most registers.
-	MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch).
+	PUSHQ PTRACE_SS(AX)
+	PUSHQ PTRACE_RSP(AX)
+	PUSHQ PTRACE_FLAGS(AX)
+	PUSHQ PTRACE_CS(AX)
+	PUSHQ PTRACE_RIP(AX)
+	PUSHQ PTRACE_RAX(AX)                // Save AX on kernel stack.
+	PUSHQ CX                            // Save userCR3 on kernel stack.
+	REGISTERS_LOAD(AX, 0)               // Restore most registers.
+	POPQ AX	                            // Get userCR3.
+	WRITE_CR3()                         // Switch to userCR3.
+	POPQ AX                             // Restore AX.
 	IRET()
 
 // See entry_amd64.go.
 TEXT ·resume(SB),NOSPLIT,$0
 	// See iret, above.
-	MOVQ CPU_REGISTERS+PTRACE_SS(GS), BX;    PUSHQ BX
-	MOVQ CPU_REGISTERS+PTRACE_RSP(GS), CX;   PUSHQ CX
-	MOVQ CPU_REGISTERS+PTRACE_FLAGS(GS), DX; PUSHQ DX
-	MOVQ CPU_REGISTERS+PTRACE_CS(GS), DI;    PUSHQ DI
-	MOVQ CPU_REGISTERS+PTRACE_RIP(GS), SI;   PUSHQ SI
-	REGISTERS_LOAD(GS, CPU_REGISTERS)
-	MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX
+	MOVQ ENTRY_CPU_SELF(GS), AX                 // Load vCPU.
+	PUSHQ CPU_REGISTERS+PTRACE_SS(AX)
+	PUSHQ CPU_REGISTERS+PTRACE_RSP(AX)
+	PUSHQ CPU_REGISTERS+PTRACE_FLAGS(AX)
+	PUSHQ CPU_REGISTERS+PTRACE_CS(AX)
+	PUSHQ CPU_REGISTERS+PTRACE_RIP(AX)
+	REGISTERS_LOAD(AX, CPU_REGISTERS)
+	MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX
 	IRET()
 
 // See entry_amd64.go.
 TEXT ·Start(SB),NOSPLIT,$0
-	LOAD_KERNEL_STACK(AX) // Set the stack.
 	PUSHQ $0x0            // Previous frame pointer.
 	MOVQ SP, BP           // Set frame pointer.
 	PUSHQ AX              // First argument (CPU).
@@ -155,53 +193,60 @@ TEXT ·Start(SB),NOSPLIT,$0
 
 // See entry_amd64.go.
 TEXT ·sysenter(SB),NOSPLIT,$0
-	// Interrupts are always disabled while we're executing in kernel mode
-	// and always enabled while executing in user mode. Therefore, we can
-	// reliably look at the flags in R11 to determine where this syscall
-	// was from.
-	TESTL $_RFLAGS_IF, R11
+	// _RFLAGS_IOPL0 is always set in the user mode and it is never set in
+	// the kernel mode. See the comment of UserFlagsSet for more details.
+	TESTL $_RFLAGS_IOPL0, R11
 	JZ kernel
-
 user:
 	SWAP_GS()
-	XCHGQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Swap stacks.
-	XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs).
+	MOVQ AX, ENTRY_SCRATCH0(GS)            // Save user AX on scratch.
+	MOVQ ENTRY_KERNEL_CR3(GS), AX          // Get kernel cr3 on AX.
+	WRITE_CR3()                            // Switch to kernel cr3.
+
+	MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
+	MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX  // Get user regs.
 	REGISTERS_SAVE(AX, 0)                  // Save all except IP, FLAGS, SP, AX.
-	MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX  // Load saved AX value.
-	MOVQ BX,  PTRACE_RAX(AX)               // Save everything else.
-	MOVQ BX,  PTRACE_ORIGRAX(AX)
 	MOVQ CX,  PTRACE_RIP(AX)
 	MOVQ R11, PTRACE_FLAGS(AX)
-	MOVQ CPU_REGISTERS+PTRACE_RSP(GS), BX; MOVQ BX, PTRACE_RSP(AX)
-	MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code.
-	MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user.
+	MOVQ SP,  PTRACE_RSP(AX)
+	MOVQ ENTRY_SCRATCH0(GS), CX            // Load saved user AX value.
+	MOVQ CX,  PTRACE_RAX(AX)               // Save everything else.
+	MOVQ CX,  PTRACE_ORIGRAX(AX)
+
+	MOVQ ENTRY_CPU_SELF(GS), AX            // Load vCPU.
+	MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP  // Get stacks.
+	MOVQ $0, CPU_ERROR_CODE(AX)            // Clear error code.
+	MOVQ $1, CPU_ERROR_TYPE(AX)            // Set error type to user.
 
 	// Return to the kernel, where the frame is:
 	//
-	//	vector      (sp+24)
+	//	vector      (sp+32)
+	//	userCR3     (sp+24)
 	// 	regs        (sp+16)
 	// 	cpu         (sp+8)
 	// 	vcpu.Switch (sp+0)
 	//
-	MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer.
-	MOVQ $Syscall, 24(SP)                 // Output vector.
+	MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
+	MOVQ $Syscall, 32(SP)                 // Output vector.
 	RET
 
 kernel:
 	// We can't restore the original stack, but we can access the registers
 	// in the CPU state directly. No need for temporary juggling.
-	MOVQ AX,  CPU_REGISTERS+PTRACE_ORIGRAX(GS)
-	MOVQ AX,  CPU_REGISTERS+PTRACE_RAX(GS)
-	REGISTERS_SAVE(GS, CPU_REGISTERS)
-	MOVQ CX,  CPU_REGISTERS+PTRACE_RIP(GS)
-	MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(GS)
-	MOVQ SP,  CPU_REGISTERS+PTRACE_RSP(GS)
-	MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code.
-	MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel.
+	MOVQ AX,  ENTRY_SCRATCH0(GS)
+	MOVQ ENTRY_CPU_SELF(GS), AX                 // Load vCPU.
+	REGISTERS_SAVE(AX, CPU_REGISTERS)
+	MOVQ CX,  CPU_REGISTERS+PTRACE_RIP(AX)
+	MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX)
+	MOVQ SP,  CPU_REGISTERS+PTRACE_RSP(AX)
+	MOVQ ENTRY_SCRATCH0(GS), BX
+	MOVQ BX,  CPU_REGISTERS+PTRACE_ORIGRAX(AX)
+	MOVQ BX,  CPU_REGISTERS+PTRACE_RAX(AX)
+	MOVQ $0,  CPU_ERROR_CODE(AX)                // Clear error code.
+	MOVQ $0,  CPU_ERROR_TYPE(AX)                // Set error type to kernel.
 
 	// Call the syscall trampoline.
 	LOAD_KERNEL_STACK(GS)
-	MOVQ CPU_SELF(GS), AX   // Load vCPU.
 	PUSHQ AX                // First argument (vCPU).
 	CALL ·kernelSyscall(SB) // Call the trampoline.
 	POPQ AX                 // Pop vCPU.
@@ -230,16 +275,21 @@ TEXT ·exception(SB),NOSPLIT,$0
 	//	ERROR_CODE  (sp+8)
 	//	VECTOR      (sp+0)
 	//
-	TESTL $_RFLAGS_IF, 32(SP)
+	TESTL $_RFLAGS_IOPL0, 32(SP)
 	JZ kernel
 
 user:
 	SWAP_GS()
 	ADDQ $-8, SP                            // Adjust for flags.
 	MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ).
-	XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX  // Swap for user regs.
+	PUSHQ AX                                // Save user AX on stack.
+	MOVQ ENTRY_KERNEL_CR3(GS), AX           // Get kernel cr3 on AX.
+	WRITE_CR3()                             // Switch to kernel cr3.
+
+	MOVQ ENTRY_CPU_SELF(GS), AX             // Load vCPU.
+	MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX   // Get user regs.
 	REGISTERS_SAVE(AX, 0)                   // Save all except IP, FLAGS, SP, AX.
-	MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX   // Restore original AX.
+	POPQ BX                                 // Restore original AX.
 	MOVQ BX, PTRACE_RAX(AX)                 // Save it.
 	MOVQ BX, PTRACE_ORIGRAX(AX)
 	MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX)
@@ -249,34 +299,36 @@ user:
 	MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX)
 
 	// Copy out and return.
+	MOVQ ENTRY_CPU_SELF(GS), AX           // Load vCPU.
 	MOVQ 0(SP), BX                        // Load vector.
 	MOVQ 8(SP), CX                        // Load error code.
-	MOVQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Original stack (kernel version).
-	MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer.
-	MOVQ CX, CPU_ERROR_CODE(GS)           // Set error code.
-	MOVQ $1, CPU_ERROR_TYPE(GS)           // Set error type to user.
-	MOVQ BX, 24(SP)                       // Output vector.
+	MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Original stack (kernel version).
+	MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer.
+	MOVQ CX, CPU_ERROR_CODE(AX)           // Set error code.
+	MOVQ $1, CPU_ERROR_TYPE(AX)           // Set error type to user.
+	MOVQ BX, 32(SP)                       // Output vector.
 	RET
 
 kernel:
 	// As per above, we can save directly.
-	MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS)
-	MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS)
-	REGISTERS_SAVE(GS, CPU_REGISTERS)
-	MOVQ 16(SP), AX; MOVQ AX, CPU_REGISTERS+PTRACE_RIP(GS)
-	MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(GS)
-	MOVQ 40(SP), CX; MOVQ CX, CPU_REGISTERS+PTRACE_RSP(GS)
+	PUSHQ AX
+	MOVQ ENTRY_CPU_SELF(GS), AX                        // Load vCPU.
+	REGISTERS_SAVE(AX, CPU_REGISTERS)
+	POPQ BX
+	MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX)
+	MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX)
+	MOVQ 16(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RIP(AX)
+	MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(AX)
+	MOVQ 40(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RSP(AX)
 
 	// Set the error code and adjust the stack.
-	MOVQ 8(SP), AX              // Load the error code.
-	MOVQ AX, CPU_ERROR_CODE(GS) // Copy out to the CPU.
-	MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel.
+	MOVQ 8(SP), BX              // Load the error code.
+	MOVQ BX, CPU_ERROR_CODE(AX) // Copy out to the CPU.
+	MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel.
 	MOVQ 0(SP), BX              // BX contains the vector.
-	ADDQ $48, SP                // Drop the exception frame.
 
 	// Call the exception trampoline.
 	LOAD_KERNEL_STACK(GS)
-	MOVQ CPU_SELF(GS), AX     // Load vCPU.
 	PUSHQ BX                  // Second argument (vector).
 	PUSHQ AX                  // First argument (vCPU).
 	CALL ·kernelException(SB) // Call the trampoline.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 9d29b7168..1079a024b 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -27,7 +27,9 @@
 
 // ERET returns using the ELR and SPSR for the current exception level.
 #define ERET() \
-  WORD $0xd69f03e0
+  WORD $0xd69f03e0; \
+  DSB $7; \
+  ISB $15;
 
 // RSV_REG is a register that holds el1 information temporarily.
 #define RSV_REG 	R18_PLATFORM
@@ -44,9 +46,11 @@
 #define SCTLR_M         1 << 0
 #define SCTLR_C         1 << 2
 #define SCTLR_I         1 << 12
+#define SCTLR_DZE       1 << 14
 #define SCTLR_UCT       1 << 15
+#define SCTLR_UCI       1 << 26
 
-#define SCTLR_EL1_DEFAULT       (SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_UCT)
+#define SCTLR_EL1_DEFAULT       (SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_UCT | SCTLR_UCI | SCTLR_DZE)
 
 // cntkctl_el1: counter-timer kernel control register el1.
 #define CNTKCTL_EL0PCTEN 	1 << 0
@@ -294,23 +298,27 @@
 	LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \
 	MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \
 	MOVD RSV_REG, RSP; \
-	WORD $0xd538d092; \   //MRS   TPIDR_EL1, R18
-	ISB $15; \
-	DSB $15;
+	WORD $0xd538d092;   //MRS   TPIDR_EL1, R18
 
 // SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application.
 #define SWITCH_TO_APP_PAGETABLE(from) \
-	MOVD CPU_TTBR0_APP(from), RSV_REG; \
-	WORD $0xd5182012; \	//        MSR R18, TTBR0_EL1
+	MRS TTBR1_EL1, R0; \
+	MOVD CPU_APP_ASID(from), R1; \
+	BFI $48, R1, $16, R0; \
+	MSR R0, TTBR1_EL1; \ // set the ASID in TTBR1_EL1 (since TCR.A1 is set)
 	ISB $15; \
-	DSB $15;
+	MOVD CPU_TTBR0_APP(from), RSV_REG; \
+	MSR RSV_REG, TTBR0_EL1;
 
 // SWITCH_TO_KVM_PAGETABLE sets the kvm pagetable.
 #define SWITCH_TO_KVM_PAGETABLE(from) \
-	MOVD CPU_TTBR0_KVM(from), RSV_REG; \
-	WORD $0xd5182012; \	//        MSR R18, TTBR0_EL1
+	MRS TTBR1_EL1, R0; \
+	MOVD $1, R1; \
+	BFI $48, R1, $16, R0; \
+	MSR R0, TTBR1_EL1; \
 	ISB $15; \
-	DSB $15;
+	MOVD CPU_TTBR0_KVM(from), RSV_REG; \
+	MSR RSV_REG, TTBR0_EL1;
 
 #define VFP_ENABLE \
 	MOVD $FPEN_ENABLE, R0; \
@@ -326,29 +334,30 @@
 #define KERNEL_ENTRY_FROM_EL0 \
 	SUB $16, RSP, RSP; \		// step1, save r18, r9 into kernel temporary stack.
 	STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \
-	WORD $0xd538d092; \    //MRS   TPIDR_EL1, R18, step2, switch user pagetable.
-	SWITCH_TO_KVM_PAGETABLE(RSV_REG); \
-	WORD $0xd538d092; \    //MRS   TPIDR_EL1, R18
-	MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step3, load app context pointer.
-	REGISTERS_SAVE(RSV_REG_APP, 0); \          // step4, save app context.
+	WORD $0xd538d092; \    // MRS   TPIDR_EL1, R18
+	MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step2, load app context pointer.
+	REGISTERS_SAVE(RSV_REG_APP, 0); \          // step3, save app context.
 	MOVD RSV_REG_APP, R20; \
 	LDP 16*0(RSP), (RSV_REG, RSV_REG_APP); \
 	ADD $16, RSP, RSP; \
 	MOVD RSV_REG, PTRACE_R18(R20); \
 	MOVD RSV_REG_APP, PTRACE_R9(R20); \
-	MOVD R20, RSV_REG_APP; \
+	MRS TPIDR_EL0, R3; \
+	MOVD R3, PTRACE_TLS(R20); \
 	WORD $0xd5384003; \      //  MRS SPSR_EL1, R3
-	MOVD R3, PTRACE_PSTATE(RSV_REG_APP); \
+	MOVD R3, PTRACE_PSTATE(R20); \
 	MRS ELR_EL1, R3; \
-	MOVD R3, PTRACE_PC(RSV_REG_APP); \
+	MOVD R3, PTRACE_PC(R20); \
 	WORD $0xd5384103; \      //  MRS SP_EL0, R3
-	MOVD R3, PTRACE_SP(RSV_REG_APP);
+	MOVD R3, PTRACE_SP(R20);
 
 // KERNEL_ENTRY_FROM_EL1 is the entry code of the vcpu from el1 to el1.
 #define KERNEL_ENTRY_FROM_EL1 \
 	WORD $0xd538d092; \   //MRS   TPIDR_EL1, R18
 	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \	// Save sentry context.
 	MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG); \
+	MRS TPIDR_EL0, R4; \
+	MOVD R4, CPU_REGISTERS+PTRACE_TLS(RSV_REG); \
 	WORD $0xd5384004; \    //    MRS SPSR_EL1, R4
 	MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \
 	MRS ELR_EL1, R4; \
@@ -357,6 +366,26 @@
 	MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \
 	LOAD_KERNEL_STACK(RSV_REG);  // Load the temporary stack.
 
+// EXCEPTION_WITH_ERROR is a common exception handler function.
+#define EXCEPTION_WITH_ERROR(user, vector) \
+	WORD $0xd538d092; \	//MRS   TPIDR_EL1, R18
+	WORD $0xd538601a; \	//MRS   FAR_EL1, R26
+	MOVD R26, CPU_FAULT_ADDR(RSV_REG); \
+	MOVD $user, R3; \
+	MOVD R3, CPU_ERROR_TYPE(RSV_REG); \	// Set error type to user.
+	MOVD $vector, R3; \
+	MOVD R3, CPU_VECTOR_CODE(RSV_REG); \
+	MRS ESR_EL1, R3; \
+	MOVD R3, CPU_ERROR_CODE(RSV_REG); \
+	B ·kernelExitToEl1(SB);
+
+// storeAppASID writes the application's asid value.
+TEXT ·storeAppASID(SB),NOSPLIT,$0-8
+	MOVD asid+0(FP), R1
+	MRS  TPIDR_EL1, RSV_REG
+	MOVD R1, CPU_APP_ASID(RSV_REG)
+	RET
+
 // Halt halts execution.
 TEXT ·Halt(SB),NOSPLIT,$0
 	// Clear bluepill.
@@ -365,8 +394,6 @@ TEXT ·Halt(SB),NOSPLIT,$0
 	BNE mmio_exit
 	MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG)
 
-	// Flush dcache.
-	WORD $0xd5087e52   // DC CISW
 mmio_exit:
 	// Disable fpsimd.
 	WORD $0xd5381041 // MRS CPACR_EL1, R1
@@ -384,9 +411,6 @@ mmio_exit:
 	MRS VBAR_EL1, R9
 	MOVD R0, 0x0(R9)
 
-	// Flush dcahce.
-	WORD $0xd5087e52  // DC CISW
-
 	RET
 
 // HaltAndResume halts execution and point the pointer to the resume function.
@@ -414,7 +438,7 @@ TEXT ·Current(SB),NOSPLIT,$0-8
 	MOVD R8, ret+0(FP)
 	RET
 
-#define STACK_FRAME_SIZE 16
+#define STACK_FRAME_SIZE 32
 
 // kernelExitToEl0 is the entrypoint for application in guest_el0.
 // Prepare the vcpu environment for container application.
@@ -423,6 +447,8 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 	MRS TPIDR_EL1, RSV_REG
 	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS)
 	MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG)
+	MRS TPIDR_EL0, R3
+	MOVD R3, CPU_REGISTERS+PTRACE_TLS(RSV_REG)
 
 	WORD $0xd5384003    //    MRS SPSR_EL1, R3
 	MOVD R3, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG)
@@ -449,8 +475,18 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 	MOVD PTRACE_PSTATE(RSV_REG_APP), R1
 	WORD $0xd5184001  //MSR R1, SPSR_EL1
 
+	// need use kernel space address to excute below code, since
+	// after SWITCH_TO_APP_PAGETABLE the ASID is changed to app's
+	// ASID.
+	WORD $0x10000061		// ADR R1, do_exit_to_el0
+	ORR $0xffff000000000000, R1, R1
+	JMP (R1)
+
+do_exit_to_el0:
 	// RSV_REG & RSV_REG_APP will be loaded at the end.
 	REGISTERS_LOAD(RSV_REG_APP, 0)
+	MOVD PTRACE_TLS(RSV_REG_APP), RSV_REG
+	MSR RSV_REG, TPIDR_EL0
 
 	// switch to user pagetable.
 	MOVD PTRACE_R18(RSV_REG_APP), RSV_REG
@@ -458,15 +494,16 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 
 	SUB $STACK_FRAME_SIZE, RSP, RSP
 	STP (RSV_REG, RSV_REG_APP), 16*0(RSP)
+	STP (R0, R1), 16*1(RSP)
 
 	WORD $0xd538d092    //MRS   TPIDR_EL1, R18
 
 	SWITCH_TO_APP_PAGETABLE(RSV_REG)
 
+	LDP 16*1(RSP), (R0, R1)
 	LDP 16*0(RSP), (RSV_REG, RSV_REG_APP)
 	ADD $STACK_FRAME_SIZE, RSP, RSP
 
-	ISB $15
 	ERET()
 
 // kernelExitToEl1 is the entrypoint for sentry in guest_el1.
@@ -482,6 +519,9 @@ TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
 	MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R1
 	MOVD R1, RSP
 
+	SWITCH_TO_KVM_PAGETABLE(RSV_REG)
+	MRS TPIDR_EL1, RSV_REG
+
 	REGISTERS_LOAD(RSV_REG, CPU_REGISTERS)
 	MOVD CPU_REGISTERS+PTRACE_R9(RSV_REG), RSV_REG_APP
 
@@ -489,8 +529,6 @@ TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
 
 // Start is the CPU entrypoint.
 TEXT ·Start(SB),NOSPLIT,$0
-	// Flush dcache.
-	WORD $0xd5087e52 // DC CISW
 	// Init.
 	MOVD $SCTLR_EL1_DEFAULT, R1
 	MSR R1, SCTLR_EL1
@@ -634,21 +672,7 @@ el0_svc:
 
 el0_da:
 el0_ia:
-	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
-	WORD $0xd538601a     //MRS   FAR_EL1, R26
-
-	MOVD R26, CPU_FAULT_ADDR(RSV_REG)
-
-	MOVD $1, R3
-	MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user.
-
-	MOVD $PageFault, R3
-	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
-
-	MRS ESR_EL1, R3
-	MOVD R3, CPU_ERROR_CODE(RSV_REG)
-
-	B ·kernelExitToEl1(SB)
+	EXCEPTION_WITH_ERROR(1, PageFault)
 
 el0_fpsimd_acc:
 	B ·Shutdown(SB)
@@ -663,10 +687,7 @@ el0_sp_pc:
 	B ·Shutdown(SB)
 
 el0_undef:
-	MOVD $El0Sync_undef, R3
-	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
-
-	B ·kernelExitToEl1(SB)
+	EXCEPTION_WITH_ERROR(1, El0Sync_undef)
 
 el0_dbg:
 	B ·Shutdown(SB)
diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD
index 549f3d228..9742308d8 100644
--- a/pkg/sentry/platform/ring0/gen_offsets/BUILD
+++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD
@@ -24,7 +24,10 @@ go_binary(
         "defs_impl_arm64.go",
         "main.go",
     ],
-    visibility = ["//pkg/sentry/platform/ring0:__pkg__"],
+    visibility = [
+        "//pkg/sentry/platform/kvm:__pkg__",
+        "//pkg/sentry/platform/ring0:__pkg__",
+    ],
     deps = [
         "//pkg/cpuid",
         "//pkg/sentry/arch",
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go
index 021693791..264be23d3 100644
--- a/pkg/sentry/platform/ring0/kernel.go
+++ b/pkg/sentry/platform/ring0/kernel.go
@@ -19,8 +19,8 @@ package ring0
 // N.B. that constraints on KernelOpts must be satisfied.
 //
 //go:nosplit
-func (k *Kernel) Init(opts KernelOpts) {
-	k.init(opts)
+func (k *Kernel) Init(opts KernelOpts, maxCPUs int) {
+	k.init(opts, maxCPUs)
 }
 
 // Halt halts execution.
@@ -49,6 +49,11 @@ func (defaultHooks) KernelException(Vector) {
 
 // kernelSyscall is a trampoline.
 //
+// When in amd64, it is called with %rip on the upper half, so it can
+// NOT access to any global data which is not mapped on upper and must
+// call to function pointers or interfaces to switch to the lower half
+// so that callee can access to global data.
+//
 // +checkescape:hard,stack
 //
 //go:nosplit
@@ -58,6 +63,11 @@ func kernelSyscall(c *CPU) {
 
 // kernelException is a trampoline.
 //
+// When in amd64, it is called with %rip on the upper half, so it can
+// NOT access to any global data which is not mapped on upper and must
+// call to function pointers or interfaces to switch to the lower half
+// so that callee can access to global data.
+//
 // +checkescape:hard,stack
 //
 //go:nosplit
@@ -68,10 +78,10 @@ func kernelException(c *CPU, vector Vector) {
 // Init initializes a new CPU.
 //
 // Init allows embedding in other objects.
-func (c *CPU) Init(k *Kernel, hooks Hooks) {
-	c.self = c   // Set self reference.
-	c.kernel = k // Set kernel reference.
-	c.init()     // Perform architectural init.
+func (c *CPU) Init(k *Kernel, cpuID int, hooks Hooks) {
+	c.self = c    // Set self reference.
+	c.kernel = k  // Set kernel reference.
+	c.init(cpuID) // Perform architectural init.
 
 	// Require hooks.
 	if hooks != nil {
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
index d37981dbf..3a9dff4cc 100644
--- a/pkg/sentry/platform/ring0/kernel_amd64.go
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -18,13 +18,42 @@ package ring0
 
 import (
 	"encoding/binary"
+	"reflect"
+
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // init initializes architecture-specific state.
-func (k *Kernel) init(opts KernelOpts) {
+func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
 	// Save the root page tables.
 	k.PageTables = opts.PageTables
 
+	entrySize := reflect.TypeOf(kernelEntry{}).Size()
+	var (
+		entries []kernelEntry
+		padding = 1
+	)
+	for {
+		entries = make([]kernelEntry, maxCPUs+padding-1)
+		totalSize := entrySize * uintptr(maxCPUs+padding-1)
+		addr := reflect.ValueOf(&entries[0]).Pointer()
+		if addr&(usermem.PageSize-1) == 0 && totalSize >= usermem.PageSize {
+			// The runtime forces power-of-2 alignment for allocations, and we are therefore
+			// safe once the first address is aligned and the chunk is at least a full page.
+			break
+		}
+		padding = padding << 1
+	}
+	k.cpuEntries = entries
+
+	k.globalIDT = &idt64{}
+	if reflect.TypeOf(idt64{}).Size() != usermem.PageSize {
+		panic("Size of globalIDT should be PageSize")
+	}
+	if reflect.ValueOf(k.globalIDT).Pointer()&(usermem.PageSize-1) != 0 {
+		panic("Allocated globalIDT should be page aligned")
+	}
+
 	// Setup the IDT, which is uniform.
 	for v, handler := range handlers {
 		// Allow Breakpoint and Overflow to be called from all
@@ -39,8 +68,26 @@ func (k *Kernel) init(opts KernelOpts) {
 	}
 }
 
+func (k *Kernel) EntryRegions() map[uintptr]uintptr {
+	regions := make(map[uintptr]uintptr)
+
+	addr := reflect.ValueOf(&k.cpuEntries[0]).Pointer()
+	size := reflect.TypeOf(kernelEntry{}).Size() * uintptr(len(k.cpuEntries))
+	end, _ := usermem.Addr(addr + size).RoundUp()
+	regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end)
+
+	addr = reflect.ValueOf(k.globalIDT).Pointer()
+	size = reflect.TypeOf(idt64{}).Size()
+	end, _ = usermem.Addr(addr + size).RoundUp()
+	regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end)
+
+	return regions
+}
+
 // init initializes architecture-specific state.
-func (c *CPU) init() {
+func (c *CPU) init(cpuID int) {
+	c.kernelEntry = &c.kernel.cpuEntries[cpuID]
+	c.cpuSelf = c
 	// Null segment.
 	c.gdt[0].setNull()
 
@@ -65,6 +112,7 @@ func (c *CPU) init() {
 
 	// Set the kernel stack pointer in the TSS (virtual address).
 	stackAddr := c.StackTop()
+	c.stackTop = stackAddr
 	c.tss.rsp0Lo = uint32(stackAddr)
 	c.tss.rsp0Hi = uint32(stackAddr >> 32)
 	c.tss.ist1Lo = uint32(stackAddr)
@@ -183,7 +231,7 @@ func IsCanonical(addr uint64) bool {
 //go:nosplit
 func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 	userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
-	kernelCR3 := c.kernel.PageTables.CR3(true, switchOpts.KernelPCID)
+	c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID))
 
 	// Sanitize registers.
 	regs := switchOpts.Registers
@@ -197,15 +245,11 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 	WriteFS(uintptr(regs.Fs_base))                   // escapes: no. Set application FS.
 	WriteGS(uintptr(regs.Gs_base))                   // escapes: no. Set application GS.
 	LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point.
-	jumpToKernel()                                   // Switch to upper half.
-	writeCR3(uintptr(userCR3))                       // Change to user address space.
 	if switchOpts.FullRestore {
-		vector = iret(c, regs)
+		vector = iret(c, regs, uintptr(userCR3))
 	} else {
-		vector = sysret(c, regs)
+		vector = sysret(c, regs, uintptr(userCR3))
 	}
-	writeCR3(uintptr(kernelCR3))                     // Return to kernel address space.
-	jumpToUser()                                     // Return to lower half.
 	SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point.
 	WriteFS(uintptr(c.registers.Fs_base))            // escapes: no. Restore kernel FS.
 	return
@@ -219,7 +263,7 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 //go:nosplit
 func start(c *CPU) {
 	// Save per-cpu & FS segment.
-	WriteGS(kernelAddr(c))
+	WriteGS(kernelAddr(c.kernelEntry))
 	WriteFS(uintptr(c.registers.Fs_base))
 
 	// Initialize floating point.
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index d0afa1aaa..b294ccc7c 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -25,13 +25,13 @@ func HaltAndResume()
 func HaltEl1SvcAndResume()
 
 // init initializes architecture-specific state.
-func (k *Kernel) init(opts KernelOpts) {
+func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
 	// Save the root page tables.
 	k.PageTables = opts.PageTables
 }
 
 // init initializes architecture-specific state.
-func (c *CPU) init() {
+func (c *CPU) init(cpuID int) {
 	// Set the kernel stack pointer(virtual address).
 	c.registers.Sp = uint64(c.StackTop())
 
@@ -53,17 +53,20 @@ func IsCanonical(addr uint64) bool {
 
 //go:nosplit
 func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
+	storeAppASID(uintptr(switchOpts.UserASID))
+	if switchOpts.Flush {
+		FlushTlbAll()
+	}
+
 	regs := switchOpts.Registers
 
 	regs.Pstate &= ^uint64(PsrFlagsClear)
 	regs.Pstate |= UserFlagsSet
 
 	LoadFloatingPoint(switchOpts.FloatingPointState)
-	SetTLS(regs.TPIDR_EL0)
 
 	kernelExitToEl0()
 
-	regs.TPIDR_EL0 = GetTLS()
 	SaveFloatingPoint(switchOpts.FloatingPointState)
 
 	vector = c.vecCode
diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go
index ca968a036..0ec5c3bc5 100644
--- a/pkg/sentry/platform/ring0/lib_amd64.go
+++ b/pkg/sentry/platform/ring0/lib_amd64.go
@@ -61,21 +61,9 @@ func wrgsbase(addr uintptr)
 // wrgsmsr writes to the GS_BASE MSR.
 func wrgsmsr(addr uintptr)
 
-// writeCR3 writes the CR3 value.
-func writeCR3(phys uintptr)
-
-// readCR3 reads the current CR3 value.
-func readCR3() uintptr
-
 // readCR2 reads the current CR2 value.
 func readCR2() uintptr
 
-// jumpToKernel jumps to the kernel version of the current RIP.
-func jumpToKernel()
-
-// jumpToUser jumps to the user version of the current RIP.
-func jumpToUser()
-
 // fninit initializes the floating point unit.
 func fninit()
 
diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s
index 75d742750..2fe83568a 100644
--- a/pkg/sentry/platform/ring0/lib_amd64.s
+++ b/pkg/sentry/platform/ring0/lib_amd64.s
@@ -127,53 +127,6 @@ TEXT ·wrgsmsr(SB),NOSPLIT,$0-8
 	BYTE $0x0f; BYTE $0x30;  // WRMSR
 	RET
 
-// jumpToUser changes execution to the user address.
-//
-// This works by changing the return value to the user version.
-TEXT ·jumpToUser(SB),NOSPLIT,$0
-	MOVQ 0(SP), AX
-	MOVQ ·KernelStartAddress(SB), BX
-	NOTQ BX
-	ANDQ BX, SP // Switch the stack.
-	ANDQ BX, BP // Switch the frame pointer.
-	ANDQ BX, AX // Future return value.
-	MOVQ AX, 0(SP)
-	RET
-
-// jumpToKernel changes execution to the kernel address space.
-//
-// This works by changing the return value to the kernel version.
-TEXT ·jumpToKernel(SB),NOSPLIT,$0
-	MOVQ 0(SP), AX
-	MOVQ ·KernelStartAddress(SB), BX
-	ORQ BX, SP // Switch the stack.
-	ORQ BX, BP // Switch the frame pointer.
-	ORQ BX, AX // Future return value.
-	MOVQ AX, 0(SP)
-	RET
-
-// writeCR3 writes the given CR3 value.
-//
-// The code corresponds to:
-//
-// 	mov %rax, %cr3
-//
-TEXT ·writeCR3(SB),NOSPLIT,$0-8
-	MOVQ cr3+0(FP), AX
-	BYTE $0x0f; BYTE $0x22; BYTE $0xd8;
-	RET
-
-// readCR3 reads the current CR3 value.
-//
-// The code corresponds to:
-//
-// 	mov %cr3, %rax
-//
-TEXT ·readCR3(SB),NOSPLIT,$0-8
-	BYTE $0x0f; BYTE $0x20; BYTE $0xd8;
-	MOVQ AX, ret+0(FP)
-	RET
-
 // readCR2 reads the current CR2 value.
 //
 // The code corresponds to:
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index 00e52c8af..d91a09de1 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -16,6 +16,15 @@
 
 package ring0
 
+// storeAppASID writes the application's asid value.
+func storeAppASID(asid uintptr)
+
+// LocalFlushTlbAll same as FlushTlbAll, but only applies to the calling CPU.
+func LocalFlushTlbAll()
+
+// FlushTlbAll flush all tlb.
+func FlushTlbAll()
+
 // CPACREL1 returns the value of the CPACR_EL1 register.
 func CPACREL1() (value uintptr)
 
@@ -44,12 +53,6 @@ func LoadFloatingPoint(*byte)
 // SaveFloatingPoint saves floating point state.
 func SaveFloatingPoint(*byte)
 
-// GetTLS returns the value of TPIDR_EL0 register.
-func GetTLS() (value uint64)
-
-// SetTLS writes the TPIDR_EL0 value.
-func SetTLS(value uint64)
-
 // Init sets function pointers based on architectural features.
 //
 // This must be called prior to using ring0.
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s
index 86bfbe46f..da9d3cf55 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.s
+++ b/pkg/sentry/platform/ring0/lib_arm64.s
@@ -15,14 +15,18 @@
 #include "funcdata.h"
 #include "textflag.h"
 
-TEXT ·GetTLS(SB),NOSPLIT,$0-8
-	MRS TPIDR_EL0, R1
-	MOVD R1, ret+0(FP)
+TEXT ·LocalFlushTlbAll(SB),NOSPLIT,$0
+	DSB $6			// dsb(nshst)
+	WORD $0xd508871f	// __tlbi(vmalle1)
+	DSB $7			// dsb(nsh)
+	ISB $15
 	RET
 
-TEXT ·SetTLS(SB),NOSPLIT,$0-8
-	MOVD addr+0(FP), R1
-	MSR R1, TPIDR_EL0
+TEXT ·FlushTlbAll(SB),NOSPLIT,$0
+	DSB $10			// dsb(ishst)
+	WORD $0xd508831f	// __tlbi(vmalle1is)
+	DSB $11			// dsb(ish)
+	ISB $15
 	RET
 
 TEXT ·CPACREL1(SB),NOSPLIT,$0-8
diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go
index b8ab120a0..ca4075b09 100644
--- a/pkg/sentry/platform/ring0/offsets_amd64.go
+++ b/pkg/sentry/platform/ring0/offsets_amd64.go
@@ -30,14 +30,21 @@ func Emit(w io.Writer) {
 
 	c := &CPU{}
 	fmt.Fprintf(w, "\n// CPU offsets.\n")
-	fmt.Fprintf(w, "#define CPU_SELF             0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer())
 	fmt.Fprintf(w, "#define CPU_REGISTERS        0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer())
-	fmt.Fprintf(w, "#define CPU_STACK_TOP        0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack)))
 	fmt.Fprintf(w, "#define CPU_ERROR_CODE       0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer())
 	fmt.Fprintf(w, "#define CPU_ERROR_TYPE       0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer())
+	fmt.Fprintf(w, "#define CPU_ENTRY            0x%02x\n", reflect.ValueOf(&c.kernelEntry).Pointer()-reflect.ValueOf(c).Pointer())
+
+	e := &kernelEntry{}
+	fmt.Fprintf(w, "\n// CPU entry offsets.\n")
+	fmt.Fprintf(w, "#define ENTRY_SCRATCH0       0x%02x\n", reflect.ValueOf(&e.scratch0).Pointer()-reflect.ValueOf(e).Pointer())
+	fmt.Fprintf(w, "#define ENTRY_STACK_TOP      0x%02x\n", reflect.ValueOf(&e.stackTop).Pointer()-reflect.ValueOf(e).Pointer())
+	fmt.Fprintf(w, "#define ENTRY_CPU_SELF       0x%02x\n", reflect.ValueOf(&e.cpuSelf).Pointer()-reflect.ValueOf(e).Pointer())
+	fmt.Fprintf(w, "#define ENTRY_KERNEL_CR3     0x%02x\n", reflect.ValueOf(&e.kernelCR3).Pointer()-reflect.ValueOf(e).Pointer())
 
 	fmt.Fprintf(w, "\n// Bits.\n")
 	fmt.Fprintf(w, "#define _RFLAGS_IF           0x%02x\n", _RFLAGS_IF)
+	fmt.Fprintf(w, "#define _RFLAGS_IOPL0         0x%02x\n", _RFLAGS_IOPL0)
 	fmt.Fprintf(w, "#define _KERNEL_FLAGS        0x%02x\n", KernelFlagsSet)
 
 	fmt.Fprintf(w, "\n// Vectors.\n")
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index f3de962f0..45eba960d 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -41,6 +41,7 @@ func Emit(w io.Writer) {
 	fmt.Fprintf(w, "#define CPU_VECTOR_CODE      0x%02x\n", reflect.ValueOf(&c.vecCode).Pointer()-reflect.ValueOf(c).Pointer())
 	fmt.Fprintf(w, "#define CPU_APP_ADDR         0x%02x\n", reflect.ValueOf(&c.appAddr).Pointer()-reflect.ValueOf(c).Pointer())
 	fmt.Fprintf(w, "#define CPU_LAZY_VFP         0x%02x\n", reflect.ValueOf(&c.lazyVFP).Pointer()-reflect.ValueOf(c).Pointer())
+	fmt.Fprintf(w, "#define CPU_APP_ASID         0x%02x\n", reflect.ValueOf(&c.appASID).Pointer()-reflect.ValueOf(c).Pointer())
 
 	fmt.Fprintf(w, "\n// Bits.\n")
 	fmt.Fprintf(w, "#define _KERNEL_FLAGS        0x%02x\n", KernelFlagsSet)
@@ -124,4 +125,5 @@ func Emit(w io.Writer) {
 	fmt.Fprintf(w, "#define PTRACE_SP       0x%02x\n", reflect.ValueOf(&p.Sp).Pointer()-reflect.ValueOf(p).Pointer())
 	fmt.Fprintf(w, "#define PTRACE_PC       0x%02x\n", reflect.ValueOf(&p.Pc).Pointer()-reflect.ValueOf(p).Pointer())
 	fmt.Fprintf(w, "#define PTRACE_PSTATE   0x%02x\n", reflect.ValueOf(&p.Pstate).Pointer()-reflect.ValueOf(p).Pointer())
+	fmt.Fprintf(w, "#define PTRACE_TLS      0x%02x\n", reflect.ValueOf(&p.TPIDR_EL0).Pointer()-reflect.ValueOf(p).Pointer())
 }
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
index 6409d1d91..520161755 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
@@ -78,7 +78,7 @@ const (
 
 const (
 	executeDisable = xn
-	optionMask     = 0xfff | 0xfff<<48
+	optionMask     = 0xfff | 0xffff<<48
 	protDefault    = accessed | shared
 )
 
@@ -188,7 +188,7 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) {
 		v |= mtNormal
 	} else {
 		v = v &^ user
-		v |= mtDevicenGnRE // Strong order for the addresses with ring0.KernelStartAddress.
+		v |= mtNormal
 	}
 	atomic.StoreUintptr((*uintptr)(p), v)
 }
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
index 1a49f12a2..5ddd10256 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
@@ -36,7 +36,7 @@ const (
 	pudSize = 1 << pudShift
 	pgdSize = 1 << pgdShift
 
-	ttbrASIDOffset = 55
+	ttbrASIDOffset = 48
 	ttbrASIDMask   = 0xff
 
 	entriesPerPage = 512
diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go
index 9da0ea685..34fbc1c35 100644
--- a/pkg/sentry/platform/ring0/x86.go
+++ b/pkg/sentry/platform/ring0/x86.go
@@ -39,7 +39,9 @@ const (
 
 	_RFLAGS_AC       = 1 << 18
 	_RFLAGS_NT       = 1 << 14
-	_RFLAGS_IOPL     = 3 << 12
+	_RFLAGS_IOPL0    = 1 << 12
+	_RFLAGS_IOPL1    = 1 << 13
+	_RFLAGS_IOPL     = _RFLAGS_IOPL0 | _RFLAGS_IOPL1
 	_RFLAGS_DF       = 1 << 10
 	_RFLAGS_IF       = 1 << 9
 	_RFLAGS_STEP     = 1 << 8
@@ -67,15 +69,45 @@ const (
 	KernelFlagsSet = _RFLAGS_RESERVED
 
 	// UserFlagsSet are always set in userspace.
-	UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF
+	//
+	// _RFLAGS_IOPL is a set of two bits and it shows the I/O privilege
+	// level. The Current Privilege Level (CPL) of the task must be less
+	// than or equal to the IOPL in order for the task or program to access
+	// I/O ports.
+	//
+	// Here, _RFLAGS_IOPL0 is used only to determine whether the task is
+	// running in the kernel or userspace mode. In the user mode, the CPL is
+	// always 3 and it doesn't matter what IOPL is set if it is bellow CPL.
+	//
+	// We need to have one bit which will be always different in user and
+	// kernel modes. And we have to remember that even though we have
+	// KernelFlagsClear, we still can see some of these flags in the kernel
+	// mode. This can happen when the goruntime switches on a goroutine
+	// which has been saved in the host mode. On restore, the popf
+	// instruction is used to restore flags and this means that all flags
+	// what the goroutine has in the host mode will be restored in the
+	// kernel mode.
+	//
+	// _RFLAGS_IOPL0 is never set in host and kernel modes and we always set
+	// it in the user mode. So if this flag is set, the task is running in
+	// the user mode and if it isn't set, the task is running in the kernel
+	// mode.
+	UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF | _RFLAGS_IOPL0
 
 	// KernelFlagsClear should always be clear in the kernel.
 	KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT
 
 	// UserFlagsClear are always cleared in userspace.
-	UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL
+	UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL1
 )
 
+// IsKernelFlags returns true if rflags coresponds to the kernel mode.
+//
+// go:nosplit
+func IsKernelFlags(rflags uint64) bool {
+	return rflags&_RFLAGS_IOPL0 == 0
+}
+
 // Vector is an exception vector.
 type Vector uintptr
 
@@ -104,7 +136,7 @@ const (
 	VirtualizationException
 	SecurityException = 0x1e
 	SyscallInt80      = 0x80
-	_NR_INTERRUPTS    = SyscallInt80 + 1
+	_NR_INTERRUPTS    = 0x100
 )
 
 // System call vectors.
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index c0fd3425b..a3f775d15 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -10,6 +10,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/binary",
         "//pkg/context",
+        "//pkg/marshal",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
@@ -20,6 +21,5 @@ go_library(
         "//pkg/syserr",
         "//pkg/tcpip",
         "//pkg/usermem",
-        "//tools/go_marshal/marshal",
     ],
 )
diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go
index d9621968c..37d02948f 100644
--- a/pkg/sentry/socket/control/control_vfs2.go
+++ b/pkg/sentry/socket/control/control_vfs2.go
@@ -24,6 +24,8 @@ import (
 )
 
 // SCMRightsVFS2 represents a SCM_RIGHTS socket control message.
+//
+// +stateify savable
 type SCMRightsVFS2 interface {
 	transport.RightsControlMessage
 
@@ -34,9 +36,11 @@ type SCMRightsVFS2 interface {
 	Files(ctx context.Context, max int) (rf RightsFilesVFS2, truncated bool)
 }
 
-// RightsFiles represents a SCM_RIGHTS socket control message. A reference is
-// maintained for each vfs.FileDescription and is release either when an FD is created or
-// when the Release method is called.
+// RightsFilesVFS2 represents a SCM_RIGHTS socket control message. A reference
+// is maintained for each vfs.FileDescription and is release either when an FD
+// is created or when the Release method is called.
+//
+// +stateify savable
 type RightsFilesVFS2 []*vfs.FileDescription
 
 // NewSCMRightsVFS2 creates a new SCM_RIGHTS socket control message
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index e76e498de..b6ebe29d6 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -21,6 +21,8 @@ go_library(
         "//pkg/context",
         "//pkg/fdnotifier",
         "//pkg/log",
+        "//pkg/marshal",
+        "//pkg/marshal/primitive",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/device",
@@ -37,11 +39,12 @@ go_library(
         "//pkg/sentry/vfs",
         "//pkg/syserr",
         "//pkg/syserror",
+        "//pkg/tcpip",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/network/ipv6",
         "//pkg/tcpip/stack",
         "//pkg/usermem",
         "//pkg/waiter",
-        "//tools/go_marshal/marshal",
-        "//tools/go_marshal/primitive",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 242e6bf76..7d3c4a01c 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -24,6 +24,8 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -36,8 +38,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
-	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 const (
diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go
index 8a1d52ebf..9a2cac40b 100644
--- a/pkg/sentry/socket/hostinet/socket_vfs2.go
+++ b/pkg/sentry/socket/hostinet/socket_vfs2.go
@@ -33,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// +stateify savable
 type socketVFS2 struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -51,7 +52,8 @@ var _ = socket.SocketVFS2(&socketVFS2{})
 
 func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) {
 	mnt := t.Kernel().SocketMount()
-	d := sockfs.NewDentry(t.Credentials(), mnt)
+	d := sockfs.NewDentry(t, mnt)
+	defer d.DecRef(t)
 
 	s := &socketVFS2{
 		socketOpsCommon: socketOpsCommon{
@@ -77,6 +79,13 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in
 	return vfsfd, nil
 }
 
+// Release implements vfs.FileDescriptionImpl.Release.
+func (s *socketVFS2) Release(ctx context.Context) {
+	t := kernel.TaskFromContext(ctx)
+	t.Kernel().DeleteSocketVFS2(&s.vfsfd)
+	s.socketOpsCommon.Release(ctx)
+}
+
 // Readiness implements waiter.Waitable.Readiness.
 func (s *socketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return s.socketOpsCommon.Readiness(mask)
@@ -97,11 +106,6 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal
 	return ioctl(ctx, s.fd, uio, args)
 }
 
-// Allocate implements vfs.FileDescriptionImpl.Allocate.
-func (s *socketVFS2) Allocate(ctx context.Context, mode, offset, length uint64) error {
-	return syserror.ENODEV
-}
-
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	return 0, syserror.ESPIPE
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index fda3dcb35..7e7857ac3 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -30,6 +30,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -59,6 +62,8 @@ type Stack struct {
 	tcpSACKEnabled bool
 	netDevFile     *os.File
 	netSNMPFile    *os.File
+	ipv4Forwarding bool
+	ipv6Forwarding bool
 }
 
 // NewStack returns an empty Stack containing no configuration.
@@ -118,6 +123,13 @@ func (s *Stack) Configure() error {
 		s.netSNMPFile = f
 	}
 
+	s.ipv6Forwarding = false
+	if ipForwarding, err := ioutil.ReadFile("/proc/sys/net/ipv6/conf/all/forwarding"); err == nil {
+		s.ipv6Forwarding = strings.TrimSpace(string(ipForwarding)) != "0"
+	} else {
+		log.Warningf("Failed to read if ipv6 forwarding is enabled, setting to false")
+	}
+
 	return nil
 }
 
@@ -312,7 +324,12 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
 }
 
 // AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
-func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+func (s *Stack) AddInterfaceAddr(int32, inet.InterfaceAddr) error {
+	return syserror.EACCES
+}
+
+// RemoveInterfaceAddr implements inet.Stack.RemoveInterfaceAddr.
+func (s *Stack) RemoveInterfaceAddr(int32, inet.InterfaceAddr) error {
 	return syserror.EACCES
 }
 
@@ -347,7 +364,7 @@ func (s *Stack) TCPSACKEnabled() (bool, error) {
 }
 
 // SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled.
-func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
+func (s *Stack) SetTCPSACKEnabled(bool) error {
 	return syserror.EACCES
 }
 
@@ -357,7 +374,7 @@ func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) {
 }
 
 // SetTCPRecovery implements inet.Stack.SetTCPRecovery.
-func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error {
+func (s *Stack) SetTCPRecovery(inet.TCPLossRecovery) error {
 	return syserror.EACCES
 }
 
@@ -418,18 +435,18 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 	}
 
 	if rawLine == "" {
-		return fmt.Errorf("Failed to get raw line")
+		return fmt.Errorf("failed to get raw line")
 	}
 
 	parts := strings.SplitN(rawLine, ":", 2)
 	if len(parts) != 2 {
-		return fmt.Errorf("Failed to get prefix from: %q", rawLine)
+		return fmt.Errorf("failed to get prefix from: %q", rawLine)
 	}
 
 	sliceStat = toSlice(stat)
 	fields := strings.Fields(strings.TrimSpace(parts[1]))
 	if len(fields) != len(sliceStat) {
-		return fmt.Errorf("Failed to parse fields: %q", rawLine)
+		return fmt.Errorf("failed to parse fields: %q", rawLine)
 	}
 	if _, ok := stat.(*inet.StatSNMPTCP); ok {
 		snmpTCP = true
@@ -445,7 +462,7 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 			sliceStat[i], err = strconv.ParseUint(fields[i], 10, 64)
 		}
 		if err != nil {
-			return fmt.Errorf("Failed to parse field %d from: %q, %v", i, rawLine, err)
+			return fmt.Errorf("failed to parse field %d from: %q, %v", i, rawLine, err)
 		}
 	}
 
@@ -468,3 +485,21 @@ func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil }
 
 // RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints.
 func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {}
+
+// Forwarding implements inet.Stack.Forwarding.
+func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool {
+	switch protocol {
+	case ipv4.ProtocolNumber:
+		return s.ipv4Forwarding
+	case ipv6.ProtocolNumber:
+		return s.ipv6Forwarding
+	default:
+		log.Warningf("Forwarding(%v) failed: unsupported protocol", protocol)
+		return false
+	}
+}
+
+// SetForwarding implements inet.Stack.SetForwarding.
+func (s *Stack) SetForwarding(tcpip.NetworkProtocolNumber, bool) error {
+	return syserror.EACCES
+}
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index 721094bbf..8aea0200f 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -6,6 +6,8 @@ go_library(
     name = "netfilter",
     srcs = [
         "extensions.go",
+        "ipv4.go",
+        "ipv6.go",
         "netfilter.go",
         "owner_matcher.go",
         "targets.go",
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index 0336a32d8..549787955 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -19,6 +19,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -37,7 +39,7 @@ type matchMaker interface {
 	// name is the matcher name as stored in the xt_entry_match struct.
 	name() string
 
-	// marshal converts from an stack.Matcher to an ABI struct.
+	// marshal converts from a stack.Matcher to an ABI struct.
 	marshal(matcher stack.Matcher) []byte
 
 	// unmarshal converts from the ABI matcher struct to an
@@ -93,3 +95,71 @@ func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf
 	}
 	return matchMaker.unmarshal(buf, filter)
 }
+
+// targetMaker knows how to (un)marshal a target. Once registered,
+// marshalTarget and unmarshalTarget can be used.
+type targetMaker interface {
+	// id uniquely identifies the target.
+	id() stack.TargetID
+
+	// marshal converts from a stack.Target to an ABI struct.
+	marshal(target stack.Target) []byte
+
+	// unmarshal converts from the ABI matcher struct to a stack.Target.
+	unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error)
+}
+
+// targetMakers maps the TargetID of supported targets to the targetMaker that
+// marshals and unmarshals it. It is immutable after package initialization.
+var targetMakers = map[stack.TargetID]targetMaker{}
+
+func targetRevision(name string, netProto tcpip.NetworkProtocolNumber, rev uint8) (uint8, bool) {
+	tid := stack.TargetID{
+		Name:            name,
+		NetworkProtocol: netProto,
+		Revision:        rev,
+	}
+	if _, ok := targetMakers[tid]; !ok {
+		return 0, false
+	}
+
+	// Return the highest supported revision unless rev is higher.
+	for _, other := range targetMakers {
+		otherID := other.id()
+		if name == otherID.Name && netProto == otherID.NetworkProtocol && otherID.Revision > rev {
+			rev = uint8(otherID.Revision)
+		}
+	}
+	return rev, true
+}
+
+// registerTargetMaker should be called by target extensions to register them
+// with the netfilter package.
+func registerTargetMaker(tm targetMaker) {
+	if _, ok := targetMakers[tm.id()]; ok {
+		panic(fmt.Sprintf("multiple targets registered with name %q.", tm.id()))
+	}
+	targetMakers[tm.id()] = tm
+}
+
+func marshalTarget(target stack.Target) []byte {
+	targetMaker, ok := targetMakers[target.ID()]
+	if !ok {
+		panic(fmt.Sprintf("unknown target of type %T with id %+v.", target, target.ID()))
+	}
+	return targetMaker.marshal(target)
+}
+
+func unmarshalTarget(target linux.XTEntryTarget, filter stack.IPHeaderFilter, buf []byte) (stack.Target, *syserr.Error) {
+	tid := stack.TargetID{
+		Name:            target.Name.String(),
+		NetworkProtocol: filter.NetworkProtocol(),
+		Revision:        target.Revision,
+	}
+	targetMaker, ok := targetMakers[tid]
+	if !ok {
+		nflog("unsupported target with name %q", target.Name.String())
+		return nil, syserr.ErrInvalidArgument
+	}
+	return targetMaker.unmarshal(buf, filter)
+}
diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go
new file mode 100644
index 000000000..b560fae0d
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/ipv4.go
@@ -0,0 +1,265 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// emptyIPv4Filter is for comparison with a rule's filters to determine whether
+// it is also empty. It is immutable.
+var emptyIPv4Filter = stack.IPHeaderFilter{
+	Dst:     "\x00\x00\x00\x00",
+	DstMask: "\x00\x00\x00\x00",
+	Src:     "\x00\x00\x00\x00",
+	SrcMask: "\x00\x00\x00\x00",
+}
+
+// convertNetstackToBinary4 converts the iptables as stored in netstack to the
+// format expected by the iptables tool. Linux stores each table as a binary
+// blob that can only be traversed by parsing a little data, reading some
+// offsets, jumping to those offsets, parsing again, etc.
+func convertNetstackToBinary4(stk *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) {
+	// The table name has to fit in the struct.
+	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
+		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
+	}
+
+	table, ok := stk.IPTables().GetTable(tablename.String(), false)
+	if !ok {
+		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
+	}
+
+	// Setup the info struct.
+	entries, info := getEntries4(table, tablename)
+	return entries, info, nil
+}
+
+func getEntries4(table stack.Table, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo) {
+	var info linux.IPTGetinfo
+	var entries linux.KernelIPTGetEntries
+	copy(info.Name[:], tablename[:])
+	copy(entries.Name[:], info.Name[:])
+	info.ValidHooks = table.ValidHooks()
+
+	for ruleIdx, rule := range table.Rules {
+		nflog("convert to binary: current offset: %d", entries.Size)
+
+		setHooksAndUnderflow(&info, table, entries.Size, ruleIdx)
+		// Each rule corresponds to an entry.
+		entry := linux.KernelIPTEntry{
+			Entry: linux.IPTEntry{
+				IP: linux.IPTIP{
+					Protocol: uint16(rule.Filter.Protocol),
+				},
+				NextOffset:   linux.SizeOfIPTEntry,
+				TargetOffset: linux.SizeOfIPTEntry,
+			},
+		}
+		copy(entry.Entry.IP.Dst[:], rule.Filter.Dst)
+		copy(entry.Entry.IP.DstMask[:], rule.Filter.DstMask)
+		copy(entry.Entry.IP.Src[:], rule.Filter.Src)
+		copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask)
+		copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface)
+		copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
+		if rule.Filter.DstInvert {
+			entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP
+		}
+		if rule.Filter.SrcInvert {
+			entry.Entry.IP.InverseFlags |= linux.IPT_INV_SRCIP
+		}
+		if rule.Filter.OutputInterfaceInvert {
+			entry.Entry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT
+		}
+
+		for _, matcher := range rule.Matchers {
+			// Serialize the matcher and add it to the
+			// entry.
+			serialized := marshalMatcher(matcher)
+			nflog("convert to binary: matcher serialized as: %v", serialized)
+			if len(serialized)%8 != 0 {
+				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
+			}
+			entry.Elems = append(entry.Elems, serialized...)
+			entry.Entry.NextOffset += uint16(len(serialized))
+			entry.Entry.TargetOffset += uint16(len(serialized))
+		}
+
+		// Serialize and append the target.
+		serialized := marshalTarget(rule.Target)
+		if len(serialized)%8 != 0 {
+			panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
+		}
+		entry.Elems = append(entry.Elems, serialized...)
+		entry.Entry.NextOffset += uint16(len(serialized))
+
+		nflog("convert to binary: adding entry: %+v", entry)
+
+		entries.Size += uint32(entry.Entry.NextOffset)
+		entries.Entrytable = append(entries.Entrytable, entry)
+		info.NumEntries++
+	}
+
+	info.Size = entries.Size
+	nflog("convert to binary: finished with an marshalled size of %d", info.Size)
+	return entries, info
+}
+
+func modifyEntries4(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) {
+	nflog("set entries: setting entries in table %q", replace.Name.String())
+
+	// Convert input into a list of rules and their offsets.
+	var offset uint32
+	// offsets maps rule byte offsets to their position in table.Rules.
+	offsets := map[uint32]int{}
+	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+		nflog("set entries: processing entry at offset %d", offset)
+
+		// Get the struct ipt_entry.
+		if len(optVal) < linux.SizeOfIPTEntry {
+			nflog("optVal has insufficient size for entry %d", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+		var entry linux.IPTEntry
+		buf := optVal[:linux.SizeOfIPTEntry]
+		binary.Unmarshal(buf, usermem.ByteOrder, &entry)
+		initialOptValLen := len(optVal)
+		optVal = optVal[linux.SizeOfIPTEntry:]
+
+		if entry.TargetOffset < linux.SizeOfIPTEntry {
+			nflog("entry has too-small target offset %d", entry.TargetOffset)
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// TODO(gvisor.dev/issue/170): We should support more IPTIP
+		// filtering fields.
+		filter, err := filterFromIPTIP(entry.IP)
+		if err != nil {
+			nflog("bad iptip: %v", err)
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
+		// that they only work for certain protocols, hooks, tables.
+		// Get matchers.
+		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
+		if len(optVal) < int(matchersSize) {
+			nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+		matchers, err := parseMatchers(filter, optVal[:matchersSize])
+		if err != nil {
+			nflog("failed to parse matchers: %v", err)
+			return nil, syserr.ErrInvalidArgument
+		}
+		optVal = optVal[matchersSize:]
+
+		// Get the target of the rule.
+		targetSize := entry.NextOffset - entry.TargetOffset
+		if len(optVal) < int(targetSize) {
+			nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		rule := stack.Rule{
+			Filter:   filter,
+			Matchers: matchers,
+		}
+
+		{
+			target, err := parseTarget(filter, optVal[:targetSize], false /* ipv6 */)
+			if err != nil {
+				nflog("failed to parse target: %v", err)
+				return nil, err
+			}
+			rule.Target = target
+		}
+		optVal = optVal[targetSize:]
+
+		table.Rules = append(table.Rules, rule)
+		offsets[offset] = int(entryIdx)
+		offset += uint32(entry.NextOffset)
+
+		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
+			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+	}
+	return offsets, nil
+}
+
+func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
+	if containsUnsupportedFields4(iptip) {
+		return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
+	}
+	if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
+	}
+	if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize {
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
+	}
+
+	n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0)
+	if n == -1 {
+		n = len(iptip.OutputInterface)
+	}
+	ifname := string(iptip.OutputInterface[:n])
+
+	n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0)
+	if n == -1 {
+		n = len(iptip.OutputInterfaceMask)
+	}
+	ifnameMask := string(iptip.OutputInterfaceMask[:n])
+
+	return stack.IPHeaderFilter{
+		Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
+		// A Protocol value of 0 indicates all protocols match.
+		CheckProtocol:         iptip.Protocol != 0,
+		Dst:                   tcpip.Address(iptip.Dst[:]),
+		DstMask:               tcpip.Address(iptip.DstMask[:]),
+		DstInvert:             iptip.InverseFlags&linux.IPT_INV_DSTIP != 0,
+		Src:                   tcpip.Address(iptip.Src[:]),
+		SrcMask:               tcpip.Address(iptip.SrcMask[:]),
+		SrcInvert:             iptip.InverseFlags&linux.IPT_INV_SRCIP != 0,
+		OutputInterface:       ifname,
+		OutputInterfaceMask:   ifnameMask,
+		OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0,
+	}, nil
+}
+
+func containsUnsupportedFields4(iptip linux.IPTIP) bool {
+	// The following features are supported:
+	// - Protocol
+	// - Dst and DstMask
+	// - Src and SrcMask
+	// - The inverse destination IP check flag
+	// - OutputInterface, OutputInterfaceMask and its inverse.
+	var emptyInterface = [linux.IFNAMSIZ]byte{}
+	// Disable any supported inverse flags.
+	inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT)
+	return iptip.InputInterface != emptyInterface ||
+		iptip.InputInterfaceMask != emptyInterface ||
+		iptip.Flags != 0 ||
+		iptip.InverseFlags&^inverseMask != 0
+}
diff --git a/pkg/sentry/socket/netfilter/ipv6.go b/pkg/sentry/socket/netfilter/ipv6.go
new file mode 100644
index 000000000..4253f7bf4
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/ipv6.go
@@ -0,0 +1,270 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// emptyIPv6Filter is for comparison with a rule's filters to determine whether
+// it is also empty. It is immutable.
+var emptyIPv6Filter = stack.IPHeaderFilter{
+	Dst:     "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+	DstMask: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+	Src:     "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+	SrcMask: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+}
+
+// convertNetstackToBinary6 converts the ip6tables as stored in netstack to the
+// format expected by the iptables tool. Linux stores each table as a binary
+// blob that can only be traversed by parsing a little data, reading some
+// offsets, jumping to those offsets, parsing again, etc.
+func convertNetstackToBinary6(stk *stack.Stack, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo, error) {
+	// The table name has to fit in the struct.
+	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
+		return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
+	}
+
+	table, ok := stk.IPTables().GetTable(tablename.String(), true)
+	if !ok {
+		return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
+	}
+
+	// Setup the info struct, which is the same in IPv4 and IPv6.
+	entries, info := getEntries6(table, tablename)
+	return entries, info, nil
+}
+
+func getEntries6(table stack.Table, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo) {
+	var info linux.IPTGetinfo
+	var entries linux.KernelIP6TGetEntries
+	copy(info.Name[:], tablename[:])
+	copy(entries.Name[:], info.Name[:])
+	info.ValidHooks = table.ValidHooks()
+
+	for ruleIdx, rule := range table.Rules {
+		nflog("convert to binary: current offset: %d", entries.Size)
+
+		setHooksAndUnderflow(&info, table, entries.Size, ruleIdx)
+		// Each rule corresponds to an entry.
+		entry := linux.KernelIP6TEntry{
+			Entry: linux.IP6TEntry{
+				IPv6: linux.IP6TIP{
+					Protocol: uint16(rule.Filter.Protocol),
+				},
+				NextOffset:   linux.SizeOfIP6TEntry,
+				TargetOffset: linux.SizeOfIP6TEntry,
+			},
+		}
+		copy(entry.Entry.IPv6.Dst[:], rule.Filter.Dst)
+		copy(entry.Entry.IPv6.DstMask[:], rule.Filter.DstMask)
+		copy(entry.Entry.IPv6.Src[:], rule.Filter.Src)
+		copy(entry.Entry.IPv6.SrcMask[:], rule.Filter.SrcMask)
+		copy(entry.Entry.IPv6.OutputInterface[:], rule.Filter.OutputInterface)
+		copy(entry.Entry.IPv6.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
+		if rule.Filter.DstInvert {
+			entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_DSTIP
+		}
+		if rule.Filter.SrcInvert {
+			entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_SRCIP
+		}
+		if rule.Filter.OutputInterfaceInvert {
+			entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_VIA_OUT
+		}
+		if rule.Filter.CheckProtocol {
+			entry.Entry.IPv6.Flags |= linux.IP6T_F_PROTO
+		}
+
+		for _, matcher := range rule.Matchers {
+			// Serialize the matcher and add it to the
+			// entry.
+			serialized := marshalMatcher(matcher)
+			nflog("convert to binary: matcher serialized as: %v", serialized)
+			if len(serialized)%8 != 0 {
+				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
+			}
+			entry.Elems = append(entry.Elems, serialized...)
+			entry.Entry.NextOffset += uint16(len(serialized))
+			entry.Entry.TargetOffset += uint16(len(serialized))
+		}
+
+		// Serialize and append the target.
+		serialized := marshalTarget(rule.Target)
+		if len(serialized)%8 != 0 {
+			panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
+		}
+		entry.Elems = append(entry.Elems, serialized...)
+		entry.Entry.NextOffset += uint16(len(serialized))
+
+		nflog("convert to binary: adding entry: %+v", entry)
+
+		entries.Size += uint32(entry.Entry.NextOffset)
+		entries.Entrytable = append(entries.Entrytable, entry)
+		info.NumEntries++
+	}
+
+	info.Size = entries.Size
+	nflog("convert to binary: finished with an marshalled size of %d", info.Size)
+	return entries, info
+}
+
+func modifyEntries6(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) {
+	nflog("set entries: setting entries in table %q", replace.Name.String())
+
+	// Convert input into a list of rules and their offsets.
+	var offset uint32
+	// offsets maps rule byte offsets to their position in table.Rules.
+	offsets := map[uint32]int{}
+	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+		nflog("set entries: processing entry at offset %d", offset)
+
+		// Get the struct ipt_entry.
+		if len(optVal) < linux.SizeOfIP6TEntry {
+			nflog("optVal has insufficient size for entry %d", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+		var entry linux.IP6TEntry
+		buf := optVal[:linux.SizeOfIP6TEntry]
+		binary.Unmarshal(buf, usermem.ByteOrder, &entry)
+		initialOptValLen := len(optVal)
+		optVal = optVal[linux.SizeOfIP6TEntry:]
+
+		if entry.TargetOffset < linux.SizeOfIP6TEntry {
+			nflog("entry has too-small target offset %d", entry.TargetOffset)
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// TODO(gvisor.dev/issue/170): We should support more IPTIP
+		// filtering fields.
+		filter, err := filterFromIP6TIP(entry.IPv6)
+		if err != nil {
+			nflog("bad iptip: %v", err)
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
+		// that they only work for certain protocols, hooks, tables.
+		// Get matchers.
+		matchersSize := entry.TargetOffset - linux.SizeOfIP6TEntry
+		if len(optVal) < int(matchersSize) {
+			nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+		matchers, err := parseMatchers(filter, optVal[:matchersSize])
+		if err != nil {
+			nflog("failed to parse matchers: %v", err)
+			return nil, syserr.ErrInvalidArgument
+		}
+		optVal = optVal[matchersSize:]
+
+		// Get the target of the rule.
+		targetSize := entry.NextOffset - entry.TargetOffset
+		if len(optVal) < int(targetSize) {
+			nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		rule := stack.Rule{
+			Filter:   filter,
+			Matchers: matchers,
+		}
+
+		{
+			target, err := parseTarget(filter, optVal[:targetSize], true /* ipv6 */)
+			if err != nil {
+				nflog("failed to parse target: %v", err)
+				return nil, err
+			}
+			rule.Target = target
+		}
+		optVal = optVal[targetSize:]
+
+		table.Rules = append(table.Rules, rule)
+		offsets[offset] = int(entryIdx)
+		offset += uint32(entry.NextOffset)
+
+		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
+			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+			return nil, syserr.ErrInvalidArgument
+		}
+	}
+	return offsets, nil
+}
+
+func filterFromIP6TIP(iptip linux.IP6TIP) (stack.IPHeaderFilter, error) {
+	if containsUnsupportedFields6(iptip) {
+		return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
+	}
+	if len(iptip.Dst) != header.IPv6AddressSize || len(iptip.DstMask) != header.IPv6AddressSize {
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
+	}
+	if len(iptip.Src) != header.IPv6AddressSize || len(iptip.SrcMask) != header.IPv6AddressSize {
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
+	}
+
+	n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0)
+	if n == -1 {
+		n = len(iptip.OutputInterface)
+	}
+	ifname := string(iptip.OutputInterface[:n])
+
+	n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0)
+	if n == -1 {
+		n = len(iptip.OutputInterfaceMask)
+	}
+	ifnameMask := string(iptip.OutputInterfaceMask[:n])
+
+	return stack.IPHeaderFilter{
+		Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
+		// In ip6tables a flag controls whether to check the protocol.
+		CheckProtocol:         iptip.Flags&linux.IP6T_F_PROTO != 0,
+		Dst:                   tcpip.Address(iptip.Dst[:]),
+		DstMask:               tcpip.Address(iptip.DstMask[:]),
+		DstInvert:             iptip.InverseFlags&linux.IP6T_INV_DSTIP != 0,
+		Src:                   tcpip.Address(iptip.Src[:]),
+		SrcMask:               tcpip.Address(iptip.SrcMask[:]),
+		SrcInvert:             iptip.InverseFlags&linux.IP6T_INV_SRCIP != 0,
+		OutputInterface:       ifname,
+		OutputInterfaceMask:   ifnameMask,
+		OutputInterfaceInvert: iptip.InverseFlags&linux.IP6T_INV_VIA_OUT != 0,
+	}, nil
+}
+
+func containsUnsupportedFields6(iptip linux.IP6TIP) bool {
+	// The following features are supported:
+	// - Protocol
+	// - Dst and DstMask
+	// - Src and SrcMask
+	// - The inverse destination IP check flag
+	// - OutputInterface, OutputInterfaceMask and its inverse.
+	var emptyInterface = [linux.IFNAMSIZ]byte{}
+	flagMask := uint8(linux.IP6T_F_PROTO)
+	// Disable any supported inverse flags.
+	inverseMask := uint8(linux.IP6T_INV_DSTIP) | uint8(linux.IP6T_INV_SRCIP) | uint8(linux.IP6T_INV_VIA_OUT)
+	return iptip.InputInterface != emptyInterface ||
+		iptip.InputInterfaceMask != emptyInterface ||
+		iptip.Flags&^flagMask != 0 ||
+		iptip.InverseFlags&^inverseMask != 0 ||
+		iptip.TOS != 0
+}
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index e91b0624c..904a12e38 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -17,7 +17,6 @@
 package netfilter
 
 import (
-	"bytes"
 	"errors"
 	"fmt"
 
@@ -27,7 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -37,15 +35,6 @@ import (
 // developing iptables, but can pollute sentry logs otherwise.
 const enableLogging = false
 
-// emptyFilter is for comparison with a rule's filters to determine whether it
-// is also empty. It is immutable.
-var emptyFilter = stack.IPHeaderFilter{
-	Dst:     "\x00\x00\x00\x00",
-	DstMask: "\x00\x00\x00\x00",
-	Src:     "\x00\x00\x00\x00",
-	SrcMask: "\x00\x00\x00\x00",
-}
-
 // nflog logs messages related to the writing and reading of iptables.
 func nflog(format string, args ...interface{}) {
 	if enableLogging && log.IsLogging(log.Debug) {
@@ -54,14 +43,19 @@ func nflog(format string, args ...interface{}) {
 }
 
 // GetInfo returns information about iptables.
-func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) {
+func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, ipv6 bool) (linux.IPTGetinfo, *syserr.Error) {
 	// Read in the struct and table name.
 	var info linux.IPTGetinfo
 	if _, err := info.CopyIn(t, outPtr); err != nil {
 		return linux.IPTGetinfo{}, syserr.FromError(err)
 	}
 
-	_, info, err := convertNetstackToBinary(stack, info.Name)
+	var err error
+	if ipv6 {
+		_, info, err = convertNetstackToBinary6(stack, info.Name)
+	} else {
+		_, info, err = convertNetstackToBinary4(stack, info.Name)
+	}
 	if err != nil {
 		nflog("couldn't convert iptables: %v", err)
 		return linux.IPTGetinfo{}, syserr.ErrInvalidArgument
@@ -71,8 +65,8 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT
 	return info, nil
 }
 
-// GetEntries returns netstack's iptables rules encoded for the iptables tool.
-func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
+// GetEntries4 returns netstack's iptables rules.
+func GetEntries4(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
 	// Read in the struct and table name.
 	var userEntries linux.IPTGetEntries
 	if _, err := userEntries.CopyIn(t, outPtr); err != nil {
@@ -82,7 +76,7 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 
 	// Convert netstack's iptables rules to something that the iptables
 	// tool can understand.
-	entries, _, err := convertNetstackToBinary(stack, userEntries.Name)
+	entries, _, err := convertNetstackToBinary4(stack, userEntries.Name)
 	if err != nil {
 		nflog("couldn't read entries: %v", err)
 		return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
@@ -95,112 +89,53 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	return entries, nil
 }
 
-// convertNetstackToBinary converts the iptables as stored in netstack to the
-// format expected by the iptables tool. Linux stores each table as a binary
-// blob that can only be traversed by parsing a bit, reading some offsets,
-// jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(stack *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) {
-	table, ok := stack.IPTables().GetTable(tablename.String())
-	if !ok {
-		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
+// GetEntries6 returns netstack's ip6tables rules.
+func GetEntries6(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIP6TGetEntries, *syserr.Error) {
+	// Read in the struct and table name. IPv4 and IPv6 utilize structs
+	// with the same layout.
+	var userEntries linux.IPTGetEntries
+	if _, err := userEntries.CopyIn(t, outPtr); err != nil {
+		nflog("couldn't copy in entries %q", userEntries.Name)
+		return linux.KernelIP6TGetEntries{}, syserr.FromError(err)
 	}
 
-	var entries linux.KernelIPTGetEntries
-	var info linux.IPTGetinfo
-	info.ValidHooks = table.ValidHooks()
-
-	// The table name has to fit in the struct.
-	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
-		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
+	// Convert netstack's iptables rules to something that the iptables
+	// tool can understand.
+	entries, _, err := convertNetstackToBinary6(stack, userEntries.Name)
+	if err != nil {
+		nflog("couldn't read entries: %v", err)
+		return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument
+	}
+	if binary.Size(entries) > uintptr(outLen) {
+		nflog("insufficient GetEntries output size: %d", uintptr(outLen))
+		return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument
 	}
-	copy(info.Name[:], tablename[:])
-	copy(entries.Name[:], tablename[:])
-
-	for ruleIdx, rule := range table.Rules {
-		nflog("convert to binary: current offset: %d", entries.Size)
-
-		// Is this a chain entry point?
-		for hook, hookRuleIdx := range table.BuiltinChains {
-			if hookRuleIdx == ruleIdx {
-				nflog("convert to binary: found hook %d at offset %d", hook, entries.Size)
-				info.HookEntry[hook] = entries.Size
-			}
-		}
-		// Is this a chain underflow point?
-		for underflow, underflowRuleIdx := range table.Underflows {
-			if underflowRuleIdx == ruleIdx {
-				nflog("convert to binary: found underflow %d at offset %d", underflow, entries.Size)
-				info.Underflow[underflow] = entries.Size
-			}
-		}
 
-		// Each rule corresponds to an entry.
-		entry := linux.KernelIPTEntry{
-			Entry: linux.IPTEntry{
-				IP: linux.IPTIP{
-					Protocol: uint16(rule.Filter.Protocol),
-				},
-				NextOffset:   linux.SizeOfIPTEntry,
-				TargetOffset: linux.SizeOfIPTEntry,
-			},
-		}
-		copy(entry.Entry.IP.Dst[:], rule.Filter.Dst)
-		copy(entry.Entry.IP.DstMask[:], rule.Filter.DstMask)
-		copy(entry.Entry.IP.Src[:], rule.Filter.Src)
-		copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask)
-		copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface)
-		copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
-		if rule.Filter.DstInvert {
-			entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP
-		}
-		if rule.Filter.SrcInvert {
-			entry.Entry.IP.InverseFlags |= linux.IPT_INV_SRCIP
-		}
-		if rule.Filter.OutputInterfaceInvert {
-			entry.Entry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT
-		}
+	return entries, nil
+}
 
-		for _, matcher := range rule.Matchers {
-			// Serialize the matcher and add it to the
-			// entry.
-			serialized := marshalMatcher(matcher)
-			nflog("convert to binary: matcher serialized as: %v", serialized)
-			if len(serialized)%8 != 0 {
-				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
-			}
-			entry.Elems = append(entry.Elems, serialized...)
-			entry.Entry.NextOffset += uint16(len(serialized))
-			entry.Entry.TargetOffset += uint16(len(serialized))
+// setHooksAndUnderflow checks whether the rule at ruleIdx is a hook entrypoint
+// or underflow, in which case it fills in info.HookEntry and info.Underflows.
+func setHooksAndUnderflow(info *linux.IPTGetinfo, table stack.Table, offset uint32, ruleIdx int) {
+	// Is this a chain entry point?
+	for hook, hookRuleIdx := range table.BuiltinChains {
+		if hookRuleIdx == ruleIdx {
+			nflog("convert to binary: found hook %d at offset %d", hook, offset)
+			info.HookEntry[hook] = offset
 		}
-
-		// Serialize and append the target.
-		serialized := marshalTarget(rule.Target)
-		if len(serialized)%8 != 0 {
-			panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
+	}
+	// Is this a chain underflow point?
+	for underflow, underflowRuleIdx := range table.Underflows {
+		if underflowRuleIdx == ruleIdx {
+			nflog("convert to binary: found underflow %d at offset %d", underflow, offset)
+			info.Underflow[underflow] = offset
 		}
-		entry.Elems = append(entry.Elems, serialized...)
-		entry.Entry.NextOffset += uint16(len(serialized))
-
-		nflog("convert to binary: adding entry: %+v", entry)
-
-		entries.Size += uint32(entry.Entry.NextOffset)
-		entries.Entrytable = append(entries.Entrytable, entry)
-		info.NumEntries++
 	}
-
-	nflog("convert to binary: finished with an marshalled size of %d", info.Size)
-	info.Size = entries.Size
-	return entries, info, nil
 }
 
 // SetEntries sets iptables rules for a single table. See
 // net/ipv4/netfilter/ip_tables.c:translate_table for reference.
-func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
-	// Get the basic rules data (struct ipt_replace).
-	if len(optVal) < linux.SizeOfIPTReplace {
-		nflog("optVal has insufficient size for replace %d", len(optVal))
-		return syserr.ErrInvalidArgument
-	}
+func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error {
 	var replace linux.IPTReplace
 	replaceBuf := optVal[:linux.SizeOfIPTReplace]
 	optVal = optVal[linux.SizeOfIPTReplace:]
@@ -218,79 +153,15 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 		return syserr.ErrInvalidArgument
 	}
 
-	nflog("set entries: setting entries in table %q", replace.Name.String())
-
-	// Convert input into a list of rules and their offsets.
-	var offset uint32
-	// offsets maps rule byte offsets to their position in table.Rules.
-	offsets := map[uint32]int{}
-	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
-		nflog("set entries: processing entry at offset %d", offset)
-
-		// Get the struct ipt_entry.
-		if len(optVal) < linux.SizeOfIPTEntry {
-			nflog("optVal has insufficient size for entry %d", len(optVal))
-			return syserr.ErrInvalidArgument
-		}
-		var entry linux.IPTEntry
-		buf := optVal[:linux.SizeOfIPTEntry]
-		binary.Unmarshal(buf, usermem.ByteOrder, &entry)
-		initialOptValLen := len(optVal)
-		optVal = optVal[linux.SizeOfIPTEntry:]
-
-		if entry.TargetOffset < linux.SizeOfIPTEntry {
-			nflog("entry has too-small target offset %d", entry.TargetOffset)
-			return syserr.ErrInvalidArgument
-		}
-
-		// TODO(gvisor.dev/issue/170): We should support more IPTIP
-		// filtering fields.
-		filter, err := filterFromIPTIP(entry.IP)
-		if err != nil {
-			nflog("bad iptip: %v", err)
-			return syserr.ErrInvalidArgument
-		}
-
-		// TODO(gvisor.dev/issue/170): Matchers and targets can specify
-		// that they only work for certain protocols, hooks, tables.
-		// Get matchers.
-		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
-		if len(optVal) < int(matchersSize) {
-			nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
-			return syserr.ErrInvalidArgument
-		}
-		matchers, err := parseMatchers(filter, optVal[:matchersSize])
-		if err != nil {
-			nflog("failed to parse matchers: %v", err)
-			return syserr.ErrInvalidArgument
-		}
-		optVal = optVal[matchersSize:]
-
-		// Get the target of the rule.
-		targetSize := entry.NextOffset - entry.TargetOffset
-		if len(optVal) < int(targetSize) {
-			nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
-			return syserr.ErrInvalidArgument
-		}
-		target, err := parseTarget(filter, optVal[:targetSize])
-		if err != nil {
-			nflog("failed to parse target: %v", err)
-			return syserr.ErrInvalidArgument
-		}
-		optVal = optVal[targetSize:]
-
-		table.Rules = append(table.Rules, stack.Rule{
-			Filter:   filter,
-			Target:   target,
-			Matchers: matchers,
-		})
-		offsets[offset] = int(entryIdx)
-		offset += uint32(entry.NextOffset)
-
-		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
-			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
-			return syserr.ErrInvalidArgument
-		}
+	var err *syserr.Error
+	var offsets map[uint32]int
+	if ipv6 {
+		offsets, err = modifyEntries6(stk, optVal, &replace, &table)
+	} else {
+		offsets, err = modifyEntries4(stk, optVal, &replace, &table)
+	}
+	if err != nil {
+		return err
 	}
 
 	// Go through the list of supported hooks for this table and, for each
@@ -305,7 +176,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 					table.BuiltinChains[hk] = ruleIdx
 				}
 				if offset == replace.Underflow[hook] {
-					if !validUnderflow(table.Rules[ruleIdx]) {
+					if !validUnderflow(table.Rules[ruleIdx], ipv6) {
 						nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP", ruleIdx)
 						return syserr.ErrInvalidArgument
 					}
@@ -323,9 +194,9 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 		}
 	}
 
-	// Add the user chains.
+	// Check the user chains.
 	for ruleIdx, rule := range table.Rules {
-		if _, ok := rule.Target.(stack.UserChainTarget); !ok {
+		if _, ok := rule.Target.(*stack.UserChainTarget); !ok {
 			continue
 		}
 
@@ -346,7 +217,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 	// Set each jump to point to the appropriate rule. Right now they hold byte
 	// offsets.
 	for ruleIdx, rule := range table.Rules {
-		jump, ok := rule.Target.(JumpTarget)
+		jump, ok := rule.Target.(*JumpTarget)
 		if !ok {
 			continue
 		}
@@ -370,7 +241,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 			if ruleIdx == stack.HookUnset {
 				continue
 			}
-			if !isUnconditionalAccept(table.Rules[ruleIdx]) {
+			if !isUnconditionalAccept(table.Rules[ruleIdx], ipv6) {
 				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
 			}
@@ -382,7 +253,8 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 	// - There are no chains without an unconditional final rule.
 	// - There are no chains without an unconditional underflow rule.
 
-	return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(replace.Name.String(), table))
+	return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(replace.Name.String(), table, ipv6))
+
 }
 
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
@@ -404,7 +276,6 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher,
 
 		// Check some invariants.
 		if match.MatchSize < linux.SizeOfXTEntryMatch {
-
 			return nil, fmt.Errorf("match size is too small, must be at least %d", linux.SizeOfXTEntryMatch)
 		}
 		if len(optVal) < int(match.MatchSize) {
@@ -429,79 +300,26 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher,
 	return matchers, nil
 }
 
-func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
-	if containsUnsupportedFields(iptip) {
-		return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
-	}
-	if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
-		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
-	}
-	if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize {
-		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
-	}
-
-	n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0)
-	if n == -1 {
-		n = len(iptip.OutputInterface)
-	}
-	ifname := string(iptip.OutputInterface[:n])
-
-	n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0)
-	if n == -1 {
-		n = len(iptip.OutputInterfaceMask)
-	}
-	ifnameMask := string(iptip.OutputInterfaceMask[:n])
-
-	return stack.IPHeaderFilter{
-		Protocol:              tcpip.TransportProtocolNumber(iptip.Protocol),
-		Dst:                   tcpip.Address(iptip.Dst[:]),
-		DstMask:               tcpip.Address(iptip.DstMask[:]),
-		DstInvert:             iptip.InverseFlags&linux.IPT_INV_DSTIP != 0,
-		Src:                   tcpip.Address(iptip.Src[:]),
-		SrcMask:               tcpip.Address(iptip.SrcMask[:]),
-		SrcInvert:             iptip.InverseFlags&linux.IPT_INV_SRCIP != 0,
-		OutputInterface:       ifname,
-		OutputInterfaceMask:   ifnameMask,
-		OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0,
-	}, nil
-}
-
-func containsUnsupportedFields(iptip linux.IPTIP) bool {
-	// The following features are supported:
-	// - Protocol
-	// - Dst and DstMask
-	// - Src and SrcMask
-	// - The inverse destination IP check flag
-	// - OutputInterface, OutputInterfaceMask and its inverse.
-	var emptyInterface = [linux.IFNAMSIZ]byte{}
-	// Disable any supported inverse flags.
-	inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT)
-	return iptip.InputInterface != emptyInterface ||
-		iptip.InputInterfaceMask != emptyInterface ||
-		iptip.Flags != 0 ||
-		iptip.InverseFlags&^inverseMask != 0
-}
-
-func validUnderflow(rule stack.Rule) bool {
+func validUnderflow(rule stack.Rule, ipv6 bool) bool {
 	if len(rule.Matchers) != 0 {
 		return false
 	}
-	if rule.Filter != emptyFilter {
+	if (ipv6 && rule.Filter != emptyIPv6Filter) || (!ipv6 && rule.Filter != emptyIPv4Filter) {
 		return false
 	}
 	switch rule.Target.(type) {
-	case stack.AcceptTarget, stack.DropTarget:
+	case *stack.AcceptTarget, *stack.DropTarget:
 		return true
 	default:
 		return false
 	}
 }
 
-func isUnconditionalAccept(rule stack.Rule) bool {
-	if !validUnderflow(rule) {
+func isUnconditionalAccept(rule stack.Rule, ipv6 bool) bool {
+	if !validUnderflow(rule, ipv6) {
 		return false
 	}
-	_, ok := rule.Target.(stack.AcceptTarget)
+	_, ok := rule.Target.(*stack.AcceptTarget)
 	return ok
 }
 
@@ -520,3 +338,20 @@ func hookFromLinux(hook int) stack.Hook {
 	}
 	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
 }
+
+// TargetRevision returns a linux.XTGetRevision for a given target. It sets
+// Revision to the highest supported value, unless the provided revision number
+// is larger.
+func TargetRevision(t *kernel.Task, revPtr usermem.Addr, netProto tcpip.NetworkProtocolNumber) (linux.XTGetRevision, *syserr.Error) {
+	// Read in the target name and version.
+	var rev linux.XTGetRevision
+	if _, err := rev.CopyIn(t, revPtr); err != nil {
+		return linux.XTGetRevision{}, syserr.FromError(err)
+	}
+	maxSupported, ok := targetRevision(rev.Name.String(), netProto, rev.Revision)
+	if !ok {
+		return linux.XTGetRevision{}, syserr.ErrProtocolNotSupported
+	}
+	rev.Revision = maxSupported
+	return rev, nil
+}
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
index 8ebdaff18..0e14447fe 100644
--- a/pkg/sentry/socket/netfilter/targets.go
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -15,255 +15,357 @@
 package netfilter
 
 import (
-	"errors"
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// errorTargetName is used to mark targets as error targets. Error targets
-// shouldn't be reached - an error has occurred if we fall through to one.
-const errorTargetName = "ERROR"
+func init() {
+	// Standard targets include ACCEPT, DROP, RETURN, and JUMP.
+	registerTargetMaker(&standardTargetMaker{
+		NetworkProtocol: header.IPv4ProtocolNumber,
+	})
+	registerTargetMaker(&standardTargetMaker{
+		NetworkProtocol: header.IPv6ProtocolNumber,
+	})
+
+	// Both user chains and actual errors are represented in iptables by
+	// error targets.
+	registerTargetMaker(&errorTargetMaker{
+		NetworkProtocol: header.IPv4ProtocolNumber,
+	})
+	registerTargetMaker(&errorTargetMaker{
+		NetworkProtocol: header.IPv6ProtocolNumber,
+	})
+
+	registerTargetMaker(&redirectTargetMaker{
+		NetworkProtocol: header.IPv4ProtocolNumber,
+	})
+	registerTargetMaker(&nfNATTargetMaker{
+		NetworkProtocol: header.IPv6ProtocolNumber,
+	})
+}
 
-// redirectTargetName is used to mark targets as redirect targets. Redirect
-// targets should be reached for only NAT and Mangle tables. These targets will
-// change the destination port/destination IP for packets.
-const redirectTargetName = "REDIRECT"
+type standardTargetMaker struct {
+	NetworkProtocol tcpip.NetworkProtocolNumber
+}
 
-func marshalTarget(target stack.Target) []byte {
+func (sm *standardTargetMaker) id() stack.TargetID {
+	// Standard targets have the empty string as a name and no revisions.
+	return stack.TargetID{
+		NetworkProtocol: sm.NetworkProtocol,
+	}
+}
+func (*standardTargetMaker) marshal(target stack.Target) []byte {
+	// Translate verdicts the same way as the iptables tool.
+	var verdict int32
 	switch tg := target.(type) {
-	case stack.AcceptTarget:
-		return marshalStandardTarget(stack.RuleAccept)
-	case stack.DropTarget:
-		return marshalStandardTarget(stack.RuleDrop)
-	case stack.ErrorTarget:
-		return marshalErrorTarget(errorTargetName)
-	case stack.UserChainTarget:
-		return marshalErrorTarget(tg.Name)
-	case stack.ReturnTarget:
-		return marshalStandardTarget(stack.RuleReturn)
-	case stack.RedirectTarget:
-		return marshalRedirectTarget(tg)
-	case JumpTarget:
-		return marshalJumpTarget(tg)
+	case *stack.AcceptTarget:
+		verdict = -linux.NF_ACCEPT - 1
+	case *stack.DropTarget:
+		verdict = -linux.NF_DROP - 1
+	case *stack.ReturnTarget:
+		verdict = linux.NF_RETURN
+	case *JumpTarget:
+		verdict = int32(tg.Offset)
 	default:
 		panic(fmt.Errorf("unknown target of type %T", target))
 	}
-}
-
-func marshalStandardTarget(verdict stack.RuleVerdict) []byte {
-	nflog("convert to binary: marshalling standard target")
 
 	// The target's name will be the empty string.
-	target := linux.XTStandardTarget{
+	xt := linux.XTStandardTarget{
 		Target: linux.XTEntryTarget{
 			TargetSize: linux.SizeOfXTStandardTarget,
 		},
-		Verdict: translateFromStandardVerdict(verdict),
+		Verdict: verdict,
 	}
 
 	ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
-	return binary.Marshal(ret, usermem.ByteOrder, target)
+	return binary.Marshal(ret, usermem.ByteOrder, xt)
+}
+
+func (*standardTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+	if len(buf) != linux.SizeOfXTStandardTarget {
+		nflog("buf has wrong size for standard target %d", len(buf))
+		return nil, syserr.ErrInvalidArgument
+	}
+	var standardTarget linux.XTStandardTarget
+	buf = buf[:linux.SizeOfXTStandardTarget]
+	binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
+
+	if standardTarget.Verdict < 0 {
+		// A Verdict < 0 indicates a non-jump verdict.
+		return translateToStandardTarget(standardTarget.Verdict, filter.NetworkProtocol())
+	}
+	// A verdict >= 0 indicates a jump.
+	return &JumpTarget{
+		Offset:          uint32(standardTarget.Verdict),
+		NetworkProtocol: filter.NetworkProtocol(),
+	}, nil
+}
+
+type errorTargetMaker struct {
+	NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+func (em *errorTargetMaker) id() stack.TargetID {
+	// Error targets have no revision.
+	return stack.TargetID{
+		Name:            stack.ErrorTargetName,
+		NetworkProtocol: em.NetworkProtocol,
+	}
 }
 
-func marshalErrorTarget(errorName string) []byte {
+func (*errorTargetMaker) marshal(target stack.Target) []byte {
+	var errorName string
+	switch tg := target.(type) {
+	case *stack.ErrorTarget:
+		errorName = stack.ErrorTargetName
+	case *stack.UserChainTarget:
+		errorName = tg.Name
+	default:
+		panic(fmt.Sprintf("errorMakerTarget cannot marshal unknown type %T", target))
+	}
+
 	// This is an error target named error
-	target := linux.XTErrorTarget{
+	xt := linux.XTErrorTarget{
 		Target: linux.XTEntryTarget{
 			TargetSize: linux.SizeOfXTErrorTarget,
 		},
 	}
-	copy(target.Name[:], errorName)
-	copy(target.Target.Name[:], errorTargetName)
+	copy(xt.Name[:], errorName)
+	copy(xt.Target.Name[:], stack.ErrorTargetName)
 
 	ret := make([]byte, 0, linux.SizeOfXTErrorTarget)
-	return binary.Marshal(ret, usermem.ByteOrder, target)
+	return binary.Marshal(ret, usermem.ByteOrder, xt)
 }
 
-func marshalRedirectTarget(rt stack.RedirectTarget) []byte {
+func (*errorTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+	if len(buf) != linux.SizeOfXTErrorTarget {
+		nflog("buf has insufficient size for error target %d", len(buf))
+		return nil, syserr.ErrInvalidArgument
+	}
+	var errorTarget linux.XTErrorTarget
+	buf = buf[:linux.SizeOfXTErrorTarget]
+	binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget)
+
+	// Error targets are used in 2 cases:
+	// * An actual error case. These rules have an error
+	//   named stack.ErrorTargetName. The last entry of the table
+	//   is usually an error case to catch any packets that
+	//   somehow fall through every rule.
+	// * To mark the start of a user defined chain. These
+	//   rules have an error with the name of the chain.
+	switch name := errorTarget.Name.String(); name {
+	case stack.ErrorTargetName:
+		return &stack.ErrorTarget{NetworkProtocol: filter.NetworkProtocol()}, nil
+	default:
+		// User defined chain.
+		return &stack.UserChainTarget{
+			Name:            name,
+			NetworkProtocol: filter.NetworkProtocol(),
+		}, nil
+	}
+}
+
+type redirectTargetMaker struct {
+	NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+func (rm *redirectTargetMaker) id() stack.TargetID {
+	return stack.TargetID{
+		Name:            stack.RedirectTargetName,
+		NetworkProtocol: rm.NetworkProtocol,
+	}
+}
+
+func (*redirectTargetMaker) marshal(target stack.Target) []byte {
+	rt := target.(*stack.RedirectTarget)
 	// This is a redirect target named redirect
-	target := linux.XTRedirectTarget{
+	xt := linux.XTRedirectTarget{
 		Target: linux.XTEntryTarget{
 			TargetSize: linux.SizeOfXTRedirectTarget,
 		},
 	}
-	copy(target.Target.Name[:], redirectTargetName)
+	copy(xt.Target.Name[:], stack.RedirectTargetName)
 
 	ret := make([]byte, 0, linux.SizeOfXTRedirectTarget)
-	target.NfRange.RangeSize = 1
-	if rt.RangeProtoSpecified {
-		target.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED
+	xt.NfRange.RangeSize = 1
+	xt.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED
+	xt.NfRange.RangeIPV4.MinPort = htons(rt.Port)
+	xt.NfRange.RangeIPV4.MaxPort = xt.NfRange.RangeIPV4.MinPort
+	return binary.Marshal(ret, usermem.ByteOrder, xt)
+}
+
+func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+	if len(buf) < linux.SizeOfXTRedirectTarget {
+		nflog("redirectTargetMaker: buf has insufficient size for redirect target %d", len(buf))
+		return nil, syserr.ErrInvalidArgument
+	}
+
+	if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber {
+		nflog("redirectTargetMaker: bad proto %d", p)
+		return nil, syserr.ErrInvalidArgument
+	}
+
+	var redirectTarget linux.XTRedirectTarget
+	buf = buf[:linux.SizeOfXTRedirectTarget]
+	binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
+
+	// Copy linux.XTRedirectTarget to stack.RedirectTarget.
+	target := stack.RedirectTarget{NetworkProtocol: filter.NetworkProtocol()}
+
+	// RangeSize should be 1.
+	nfRange := redirectTarget.NfRange
+	if nfRange.RangeSize != 1 {
+		nflog("redirectTargetMaker: bad rangesize %d", nfRange.RangeSize)
+		return nil, syserr.ErrInvalidArgument
+	}
+
+	// TODO(gvisor.dev/issue/170): Check if the flags are valid.
+	// Also check if we need to map ports or IP.
+	// For now, redirect target only supports destination port change.
+	// Port range and IP range are not supported yet.
+	if nfRange.RangeIPV4.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED {
+		nflog("redirectTargetMaker: invalid range flags %d", nfRange.RangeIPV4.Flags)
+		return nil, syserr.ErrInvalidArgument
+	}
+
+	// TODO(gvisor.dev/issue/170): Port range is not supported yet.
+	if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort {
+		nflog("redirectTargetMaker: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort)
+		return nil, syserr.ErrInvalidArgument
 	}
-	// Convert port from little endian to big endian.
-	port := make([]byte, 2)
-	binary.LittleEndian.PutUint16(port, rt.MinPort)
-	target.NfRange.RangeIPV4.MinPort = binary.BigEndian.Uint16(port)
-	binary.LittleEndian.PutUint16(port, rt.MaxPort)
-	target.NfRange.RangeIPV4.MaxPort = binary.BigEndian.Uint16(port)
-	return binary.Marshal(ret, usermem.ByteOrder, target)
+	if nfRange.RangeIPV4.MinIP != nfRange.RangeIPV4.MaxIP {
+		nflog("redirectTargetMaker: MinIP != MaxIP (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort)
+		return nil, syserr.ErrInvalidArgument
+	}
+
+	target.Addr = tcpip.Address(nfRange.RangeIPV4.MinIP[:])
+	target.Port = ntohs(nfRange.RangeIPV4.MinPort)
+
+	return &target, nil
 }
 
-func marshalJumpTarget(jt JumpTarget) []byte {
-	nflog("convert to binary: marshalling jump target")
+type nfNATTarget struct {
+	Target linux.XTEntryTarget
+	Range  linux.NFNATRange
+}
 
-	// The target's name will be the empty string.
-	target := linux.XTStandardTarget{
+const nfNATMarhsalledSize = linux.SizeOfXTEntryTarget + linux.SizeOfNFNATRange
+
+type nfNATTargetMaker struct {
+	NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+func (rm *nfNATTargetMaker) id() stack.TargetID {
+	return stack.TargetID{
+		Name:            stack.RedirectTargetName,
+		NetworkProtocol: rm.NetworkProtocol,
+	}
+}
+
+func (*nfNATTargetMaker) marshal(target stack.Target) []byte {
+	rt := target.(*stack.RedirectTarget)
+	nt := nfNATTarget{
 		Target: linux.XTEntryTarget{
-			TargetSize: linux.SizeOfXTStandardTarget,
+			TargetSize: nfNATMarhsalledSize,
+		},
+		Range: linux.NFNATRange{
+			Flags: linux.NF_NAT_RANGE_PROTO_SPECIFIED,
 		},
-		// Verdict is overloaded by the ABI. When positive, it holds
-		// the jump offset from the start of the table.
-		Verdict: int32(jt.Offset),
 	}
+	copy(nt.Target.Name[:], stack.RedirectTargetName)
+	copy(nt.Range.MinAddr[:], rt.Addr)
+	copy(nt.Range.MaxAddr[:], rt.Addr)
 
-	ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
-	return binary.Marshal(ret, usermem.ByteOrder, target)
+	nt.Range.MinProto = htons(rt.Port)
+	nt.Range.MaxProto = nt.Range.MinProto
+
+	ret := make([]byte, 0, nfNATMarhsalledSize)
+	return binary.Marshal(ret, usermem.ByteOrder, nt)
 }
 
-// translateFromStandardVerdict translates verdicts the same way as the iptables
-// tool.
-func translateFromStandardVerdict(verdict stack.RuleVerdict) int32 {
-	switch verdict {
-	case stack.RuleAccept:
-		return -linux.NF_ACCEPT - 1
-	case stack.RuleDrop:
-		return -linux.NF_DROP - 1
-	case stack.RuleReturn:
-		return linux.NF_RETURN
-	default:
-		// TODO(gvisor.dev/issue/170): Support Jump.
-		panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
+func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+	if size := nfNATMarhsalledSize; len(buf) < size {
+		nflog("nfNATTargetMaker: buf has insufficient size (%d) for nfNAT target (%d)", len(buf), size)
+		return nil, syserr.ErrInvalidArgument
 	}
+
+	if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber {
+		nflog("nfNATTargetMaker: bad proto %d", p)
+		return nil, syserr.ErrInvalidArgument
+	}
+
+	var natRange linux.NFNATRange
+	buf = buf[linux.SizeOfXTEntryTarget:nfNATMarhsalledSize]
+	binary.Unmarshal(buf, usermem.ByteOrder, &natRange)
+
+	// We don't support port or address ranges.
+	if natRange.MinAddr != natRange.MaxAddr {
+		nflog("nfNATTargetMaker: MinAddr and MaxAddr are different")
+		return nil, syserr.ErrInvalidArgument
+	}
+	if natRange.MinProto != natRange.MaxProto {
+		nflog("nfNATTargetMaker: MinProto and MaxProto are different")
+		return nil, syserr.ErrInvalidArgument
+	}
+
+	// TODO(gvisor.dev/issue/3549): Check for other flags.
+	// For now, redirect target only supports destination change.
+	if natRange.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED {
+		nflog("nfNATTargetMaker: invalid range flags %d", natRange.Flags)
+		return nil, syserr.ErrInvalidArgument
+	}
+
+	target := stack.RedirectTarget{
+		NetworkProtocol: filter.NetworkProtocol(),
+		Addr:            tcpip.Address(natRange.MinAddr[:]),
+		Port:            ntohs(natRange.MinProto),
+	}
+
+	return &target, nil
 }
 
 // translateToStandardTarget translates from the value in a
 // linux.XTStandardTarget to an stack.Verdict.
-func translateToStandardTarget(val int32) (stack.Target, error) {
+func translateToStandardTarget(val int32, netProto tcpip.NetworkProtocolNumber) (stack.Target, *syserr.Error) {
 	// TODO(gvisor.dev/issue/170): Support other verdicts.
 	switch val {
 	case -linux.NF_ACCEPT - 1:
-		return stack.AcceptTarget{}, nil
+		return &stack.AcceptTarget{NetworkProtocol: netProto}, nil
 	case -linux.NF_DROP - 1:
-		return stack.DropTarget{}, nil
+		return &stack.DropTarget{NetworkProtocol: netProto}, nil
 	case -linux.NF_QUEUE - 1:
-		return nil, errors.New("unsupported iptables verdict QUEUE")
+		nflog("unsupported iptables verdict QUEUE")
+		return nil, syserr.ErrInvalidArgument
 	case linux.NF_RETURN:
-		return stack.ReturnTarget{}, nil
+		return &stack.ReturnTarget{NetworkProtocol: netProto}, nil
 	default:
-		return nil, fmt.Errorf("unknown iptables verdict %d", val)
+		nflog("unknown iptables verdict %d", val)
+		return nil, syserr.ErrInvalidArgument
 	}
 }
 
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
-func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, error) {
+func parseTarget(filter stack.IPHeaderFilter, optVal []byte, ipv6 bool) (stack.Target, *syserr.Error) {
 	nflog("set entries: parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
-		return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal))
+		nflog("optVal has insufficient size for entry target %d", len(optVal))
+		return nil, syserr.ErrInvalidArgument
 	}
 	var target linux.XTEntryTarget
 	buf := optVal[:linux.SizeOfXTEntryTarget]
 	binary.Unmarshal(buf, usermem.ByteOrder, &target)
-	switch target.Name.String() {
-	case "":
-		// Standard target.
-		if len(optVal) != linux.SizeOfXTStandardTarget {
-			return nil, fmt.Errorf("optVal has wrong size for standard target %d", len(optVal))
-		}
-		var standardTarget linux.XTStandardTarget
-		buf = optVal[:linux.SizeOfXTStandardTarget]
-		binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
-
-		if standardTarget.Verdict < 0 {
-			// A Verdict < 0 indicates a non-jump verdict.
-			return translateToStandardTarget(standardTarget.Verdict)
-		}
-		// A verdict >= 0 indicates a jump.
-		return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil
-
-	case errorTargetName:
-		// Error target.
-		if len(optVal) != linux.SizeOfXTErrorTarget {
-			return nil, fmt.Errorf("optVal has insufficient size for error target %d", len(optVal))
-		}
-		var errorTarget linux.XTErrorTarget
-		buf = optVal[:linux.SizeOfXTErrorTarget]
-		binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget)
-
-		// Error targets are used in 2 cases:
-		// * An actual error case. These rules have an error
-		//   named errorTargetName. The last entry of the table
-		//   is usually an error case to catch any packets that
-		//   somehow fall through every rule.
-		// * To mark the start of a user defined chain. These
-		//   rules have an error with the name of the chain.
-		switch name := errorTarget.Name.String(); name {
-		case errorTargetName:
-			nflog("set entries: error target")
-			return stack.ErrorTarget{}, nil
-		default:
-			// User defined chain.
-			nflog("set entries: user-defined target %q", name)
-			return stack.UserChainTarget{Name: name}, nil
-		}
-
-	case redirectTargetName:
-		// Redirect target.
-		if len(optVal) < linux.SizeOfXTRedirectTarget {
-			return nil, fmt.Errorf("netfilter.SetEntries: optVal has insufficient size for redirect target %d", len(optVal))
-		}
-
-		if filter.Protocol != header.TCPProtocolNumber && filter.Protocol != header.UDPProtocolNumber {
-			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
-		}
-
-		var redirectTarget linux.XTRedirectTarget
-		buf = optVal[:linux.SizeOfXTRedirectTarget]
-		binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
-
-		// Copy linux.XTRedirectTarget to stack.RedirectTarget.
-		var target stack.RedirectTarget
-		nfRange := redirectTarget.NfRange
-
-		// RangeSize should be 1.
-		if nfRange.RangeSize != 1 {
-			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
-		}
-
-		// TODO(gvisor.dev/issue/170): Check if the flags are valid.
-		// Also check if we need to map ports or IP.
-		// For now, redirect target only supports destination port change.
-		// Port range and IP range are not supported yet.
-		if nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED == 0 {
-			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
-		}
-		target.RangeProtoSpecified = true
-
-		target.MinIP = tcpip.Address(nfRange.RangeIPV4.MinIP[:])
-		target.MaxIP = tcpip.Address(nfRange.RangeIPV4.MaxIP[:])
-
-		// TODO(gvisor.dev/issue/170): Port range is not supported yet.
-		if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort {
-			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
-		}
-
-		// Convert port from big endian to little endian.
-		port := make([]byte, 2)
-		binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MinPort)
-		target.MinPort = binary.LittleEndian.Uint16(port)
-
-		binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MaxPort)
-		target.MaxPort = binary.LittleEndian.Uint16(port)
-		return target, nil
-	}
 
-	// Unknown target.
-	return nil, fmt.Errorf("unknown target %q doesn't exist or isn't supported yet", target.Name.String())
+	return unmarshalTarget(target, filter, optVal)
 }
 
 // JumpTarget implements stack.Target.
@@ -274,9 +376,31 @@ type JumpTarget struct {
 
 	// RuleNum is the rule to jump to.
 	RuleNum int
+
+	// NetworkProtocol is the network protocol the target is used with.
+	NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+// ID implements Target.ID.
+func (jt *JumpTarget) ID() stack.TargetID {
+	return stack.TargetID{
+		NetworkProtocol: jt.NetworkProtocol,
+	}
 }
 
 // Action implements stack.Target.Action.
-func (jt JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) {
+func (jt *JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) {
 	return stack.RuleJump, jt.RuleNum
 }
+
+func ntohs(port uint16) uint16 {
+	buf := make([]byte, 2)
+	binary.BigEndian.PutUint16(buf, port)
+	return usermem.ByteOrder.Uint16(buf)
+}
+
+func htons(port uint16) uint16 {
+	buf := make([]byte, 2)
+	usermem.ByteOrder.PutUint16(buf, port)
+	return binary.BigEndian.Uint16(buf)
+}
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index 0bfd6c1f4..352c51390 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -71,7 +71,7 @@ func (tcpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Ma
 	}
 
 	if filter.Protocol != header.TCPProtocolNumber {
-		return nil, fmt.Errorf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber)
+		return nil, fmt.Errorf("TCP matching is only valid for protocol %d", header.TCPProtocolNumber)
 	}
 
 	return &TCPMatcher{
@@ -97,17 +97,33 @@ func (*TCPMatcher) Name() string {
 
 // Match implements Matcher.Match.
 func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
-	netHeader := header.IPv4(pkt.NetworkHeader().View())
+	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
+	// into the stack.Check codepath as matchers are added.
+	switch pkt.NetworkProtocolNumber {
+	case header.IPv4ProtocolNumber:
+		netHeader := header.IPv4(pkt.NetworkHeader().View())
+		if netHeader.TransportProtocol() != header.TCPProtocolNumber {
+			return false, false
+		}
 
-	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
-		return false, false
-	}
+		// We don't match fragments.
+		if frag := netHeader.FragmentOffset(); frag != 0 {
+			if frag == 1 {
+				return false, true
+			}
+			return false, false
+		}
 
-	// We dont't match fragments.
-	if frag := netHeader.FragmentOffset(); frag != 0 {
-		if frag == 1 {
-			return false, true
+	case header.IPv6ProtocolNumber:
+		// As in Linux, we do not perform an IPv6 fragment check. See
+		// xt_action_param.fragoff in
+		// include/linux/netfilter/x_tables.h.
+		if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.TCPProtocolNumber {
+			return false, false
 		}
+
+	default:
+		// We don't know the network protocol.
 		return false, false
 	}
 
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index 7ed05461d..c88d8268d 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -68,7 +68,7 @@ func (udpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Ma
 	}
 
 	if filter.Protocol != header.UDPProtocolNumber {
-		return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber)
+		return nil, fmt.Errorf("UDP matching is only valid for protocol %d", header.UDPProtocolNumber)
 	}
 
 	return &UDPMatcher{
@@ -94,19 +94,33 @@ func (*UDPMatcher) Name() string {
 
 // Match implements Matcher.Match.
 func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
-	netHeader := header.IPv4(pkt.NetworkHeader().View())
-
 	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
 	// into the stack.Check codepath as matchers are added.
-	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
-		return false, false
-	}
+	switch pkt.NetworkProtocolNumber {
+	case header.IPv4ProtocolNumber:
+		netHeader := header.IPv4(pkt.NetworkHeader().View())
+		if netHeader.TransportProtocol() != header.UDPProtocolNumber {
+			return false, false
+		}
 
-	// We dont't match fragments.
-	if frag := netHeader.FragmentOffset(); frag != 0 {
-		if frag == 1 {
-			return false, true
+		// We don't match fragments.
+		if frag := netHeader.FragmentOffset(); frag != 0 {
+			if frag == 1 {
+				return false, true
+			}
+			return false, false
 		}
+
+	case header.IPv6ProtocolNumber:
+		// As in Linux, we do not perform an IPv6 fragment check. See
+		// xt_action_param.fragoff in
+		// include/linux/netfilter/x_tables.h.
+		if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.UDPProtocolNumber {
+			return false, false
+		}
+
+	default:
+		// We don't know the network protocol.
 		return false, false
 	}
 
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 0546801bf..1f926aa91 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -16,6 +16,8 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/binary",
         "//pkg/context",
+        "//pkg/marshal",
+        "//pkg/marshal/primitive",
         "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
@@ -36,8 +38,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/usermem",
         "//pkg/waiter",
-        "//tools/go_marshal/marshal",
-        "//tools/go_marshal/primitive",
     ],
 )
 
diff --git a/pkg/sentry/socket/netlink/provider_vfs2.go b/pkg/sentry/socket/netlink/provider_vfs2.go
index bb205be0d..f061c5d62 100644
--- a/pkg/sentry/socket/netlink/provider_vfs2.go
+++ b/pkg/sentry/socket/netlink/provider_vfs2.go
@@ -51,7 +51,8 @@ func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol
 
 	vfsfd := &s.vfsfd
 	mnt := t.Kernel().SocketMount()
-	d := sockfs.NewDentry(t.Credentials(), mnt)
+	d := sockfs.NewDentry(t, mnt)
+	defer d.DecRef(t)
 	if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
 		DenyPRead:         true,
 		DenyPWrite:        true,
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index c84d8bd7c..22216158e 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -36,9 +36,9 @@ type commandKind int
 
 const (
 	kindNew commandKind = 0x0
-	kindDel             = 0x1
-	kindGet             = 0x2
-	kindSet             = 0x3
+	kindDel commandKind = 0x1
+	kindGet commandKind = 0x2
+	kindSet commandKind = 0x3
 )
 
 func typeKind(typ uint16) commandKind {
@@ -423,6 +423,11 @@ func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlin
 		}
 		attrs = rest
 
+		// NOTE: A netlink message will contain multiple header attributes.
+		// Both the IFA_ADDRESS and IFA_LOCAL attributes are typically sent
+		// with IFA_ADDRESS being a prefix address and IFA_LOCAL being the
+		// local interface address. We add the local interface address here
+		// and ignore the IFA_ADDRESS.
 		switch ahdr.Type {
 		case linux.IFA_LOCAL:
 			err := stack.AddInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{
@@ -439,8 +444,57 @@ func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlin
 			} else if err != nil {
 				return syserr.ErrInvalidArgument
 			}
+		case linux.IFA_ADDRESS:
+		default:
+			return syserr.ErrNotSupported
+		}
+	}
+	return nil
+}
+
+// delAddr handles RTM_DELADDR requests.
+func (p *Protocol) delAddr(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	stack := inet.StackFromContext(ctx)
+	if stack == nil {
+		// No network stack.
+		return syserr.ErrProtocolNotSupported
+	}
+
+	var ifa linux.InterfaceAddrMessage
+	attrs, ok := msg.GetData(&ifa)
+	if !ok {
+		return syserr.ErrInvalidArgument
+	}
+
+	for !attrs.Empty() {
+		ahdr, value, rest, ok := attrs.ParseFirst()
+		if !ok {
+			return syserr.ErrInvalidArgument
+		}
+		attrs = rest
+
+		// NOTE: A netlink message will contain multiple header attributes.
+		// Both the IFA_ADDRESS and IFA_LOCAL attributes are typically sent
+		// with IFA_ADDRESS being a prefix address and IFA_LOCAL being the
+		// local interface address. We use the local interface address to
+		// remove the address and ignore the IFA_ADDRESS.
+		switch ahdr.Type {
+		case linux.IFA_LOCAL:
+			err := stack.RemoveInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{
+				Family:    ifa.Family,
+				PrefixLen: ifa.PrefixLen,
+				Flags:     ifa.Flags,
+				Addr:      value,
+			})
+			if err != nil {
+				return syserr.ErrInvalidArgument
+			}
+		case linux.IFA_ADDRESS:
+		default:
+			return syserr.ErrNotSupported
 		}
 	}
+
 	return nil
 }
 
@@ -485,6 +539,8 @@ func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms
 			return p.dumpRoutes(ctx, msg, ms)
 		case linux.RTM_NEWADDR:
 			return p.newAddr(ctx, msg, ms)
+		case linux.RTM_DELADDR:
+			return p.delAddr(ctx, msg, ms)
 		default:
 			return syserr.ErrNotSupported
 		}
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 68a9b9a96..3baad098b 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -16,11 +16,14 @@
 package netlink
 
 import (
+	"io"
 	"math"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -38,8 +41,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
-	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 const sizeOfInt32 int = 4
@@ -748,6 +749,12 @@ func (s *socketOpsCommon) sendMsg(ctx context.Context, src usermem.IOSequence, t
 
 	buf := make([]byte, src.NumBytes())
 	n, err := src.CopyIn(ctx, buf)
+	// io.EOF can be only returned if src is a file, this means that
+	// sendMsg is called from splice and the error has to be ignored in
+	// this case.
+	if err == io.EOF {
+		err = nil
+	}
 	if err != nil {
 		// Don't partially consume messages.
 		return 0, syserr.FromError(err)
diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go
index a38d25da9..461d524e5 100644
--- a/pkg/sentry/socket/netlink/socket_vfs2.go
+++ b/pkg/sentry/socket/netlink/socket_vfs2.go
@@ -37,6 +37,8 @@ import (
 // to/from the kernel.
 //
 // SocketVFS2 implements socket.SocketVFS2 and transport.Credentialer.
+//
+// +stateify savable
 type SocketVFS2 struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -82,6 +84,13 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV
 	return fd, nil
 }
 
+// Release implements vfs.FileDescriptionImpl.Release.
+func (s *SocketVFS2) Release(ctx context.Context) {
+	t := kernel.TaskFromContext(ctx)
+	t.Kernel().DeleteSocketVFS2(&s.vfsfd)
+	s.socketOpsCommon.Release(ctx)
+}
+
 // Readiness implements waiter.Waitable.Readiness.
 func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return s.socketOpsCommon.Readiness(mask)
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index 1fb777a6c..fae3b6783 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -22,6 +22,8 @@ go_library(
         "//pkg/binary",
         "//pkg/context",
         "//pkg/log",
+        "//pkg/marshal",
+        "//pkg/marshal/primitive",
         "//pkg/metric",
         "//pkg/safemem",
         "//pkg/sentry/arch",
@@ -51,8 +53,6 @@ go_library(
         "//pkg/tcpip/transport/udp",
         "//pkg/usermem",
         "//pkg/waiter",
-        "//tools/go_marshal/marshal",
-        "//tools/go_marshal/primitive",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index e4846bc0b..86c634715 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -40,6 +40,8 @@ import (
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/metric"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -62,8 +64,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
-	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 func mustCreateMetric(name, description string) *tcpip.StatCounter {
@@ -158,6 +158,9 @@ var Metrics = tcpip.Stats{
 		OutgoingPacketErrors:                mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."),
 		MalformedPacketsReceived:            mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."),
 		MalformedFragmentsReceived:          mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."),
+		IPTablesPreroutingDropped:           mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Total number of IP packets dropped in the Prerouting chain."),
+		IPTablesInputDropped:                mustCreateMetric("/netstack/ip/iptables/input_dropped", "Total number of IP packets dropped in the Input chain."),
+		IPTablesOutputDropped:               mustCreateMetric("/netstack/ip/iptables/output_dropped", "Total number of IP packets dropped in the Output chain."),
 	},
 	TCP: tcpip.TCPStats{
 		ActiveConnectionOpenings:           mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
@@ -195,7 +198,6 @@ var Metrics = tcpip.Stats{
 		PacketsSent:              mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."),
 		PacketSendErrors:         mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."),
 		ChecksumErrors:           mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."),
-		InvalidSourceAddress:     mustCreateMetric("/netstack/udp/invalid_source", "Number of UDP datagrams dropped due to invalid source address."),
 	},
 }
 
@@ -236,7 +238,7 @@ type commonEndpoint interface {
 
 	// SetSockOpt implements tcpip.Endpoint.SetSockOpt and
 	// transport.Endpoint.SetSockOpt.
-	SetSockOpt(interface{}) *tcpip.Error
+	SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error
 
 	// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool and
 	// transport.Endpoint.SetSockOptBool.
@@ -248,7 +250,7 @@ type commonEndpoint interface {
 
 	// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
 	// transport.Endpoint.GetSockOpt.
-	GetSockOpt(interface{}) *tcpip.Error
+	GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error
 
 	// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool and
 	// transport.Endpoint.GetSockOpt.
@@ -257,6 +259,9 @@ type commonEndpoint interface {
 	// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
 	// transport.Endpoint.GetSockOpt.
 	GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
+
+	// LastError implements tcpip.Endpoint.LastError.
+	LastError() *tcpip.Error
 }
 
 // LINT.IfChange
@@ -479,8 +484,35 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error {
 }
 
 // Release implements fs.FileOperations.Release.
-func (s *socketOpsCommon) Release(context.Context) {
+func (s *socketOpsCommon) Release(ctx context.Context) {
+	e, ch := waiter.NewChannelEntry(nil)
+	s.EventRegister(&e, waiter.EventHUp|waiter.EventErr)
+	defer s.EventUnregister(&e)
+
 	s.Endpoint.Close()
+
+	// SO_LINGER option is valid only for TCP. For other socket types
+	// return after endpoint close.
+	if family, skType, _ := s.Type(); skType != linux.SOCK_STREAM || (family != linux.AF_INET && family != linux.AF_INET6) {
+		return
+	}
+
+	var v tcpip.LingerOption
+	if err := s.Endpoint.GetSockOpt(&v); err != nil {
+		return
+	}
+
+	// The case for zero timeout is handled in tcp endpoint close function.
+	// Close is blocked until either:
+	// 1. The endpoint state is not in any of the states: FIN-WAIT1,
+	// CLOSING and LAST_ACK.
+	// 2. Timeout is reached.
+	if v.Enabled && v.Timeout != 0 {
+		t := kernel.TaskFromContext(ctx)
+		start := t.Kernel().MonotonicClock().Now()
+		deadline := start.Add(v.Timeout)
+		t.BlockWithDeadline(ch, true, deadline)
+	}
 }
 
 // Read implements fs.FileOperations.Read.
@@ -555,6 +587,11 @@ func (i *ioSequencePayload) Payload(size int) ([]byte, *tcpip.Error) {
 	}
 	v := buffer.NewView(size)
 	if _, err := i.src.CopyIn(i.ctx, v); err != nil {
+		// EOF can be returned only if src is a file and this means it
+		// is in a splice syscall and the error has to be ignored.
+		if err == io.EOF {
+			return v, nil
+		}
 		return nil, tcpip.ErrBadAddress
 	}
 	return v, nil
@@ -803,7 +840,20 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
 	}
 
 	// Issue the bind request to the endpoint.
-	return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
+	err := s.Endpoint.Bind(addr)
+	if err == tcpip.ErrNoPortAvailable {
+		// Bind always returns EADDRINUSE irrespective of if the specified port was
+		// already bound or if an ephemeral port was requested but none were
+		// available.
+		//
+		// tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because
+		// UDP connect returns EAGAIN on ephemeral port exhaustion.
+		//
+		// TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion.
+		err = tcpip.ErrPortInUse
+	}
+
+	return syserr.TranslateNetstackError(err)
 }
 
 // Listen implements the linux syscall listen(2) for sockets backed by
@@ -814,7 +864,7 @@ func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
 
 // blockingAccept implements a blocking version of accept(2), that is, if no
 // connections are ready to be accept, it will block until one becomes ready.
-func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
+func (s *socketOpsCommon) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
 	// Register for notifications.
 	e, ch := waiter.NewChannelEntry(nil)
 	s.EventRegister(&e, waiter.EventIn)
@@ -823,7 +873,7 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waite
 	// Try to accept the connection again; if it fails, then wait until we
 	// get a notification.
 	for {
-		if ep, wq, err := s.Endpoint.Accept(); err != tcpip.ErrWouldBlock {
+		if ep, wq, err := s.Endpoint.Accept(peerAddr); err != tcpip.ErrWouldBlock {
 			return ep, wq, syserr.TranslateNetstackError(err)
 		}
 
@@ -836,15 +886,18 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waite
 // Accept implements the linux syscall accept(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
-	// Issue the accept request to get the new endpoint.
-	ep, wq, terr := s.Endpoint.Accept()
+	var peerAddr *tcpip.FullAddress
+	if peerRequested {
+		peerAddr = &tcpip.FullAddress{}
+	}
+	ep, wq, terr := s.Endpoint.Accept(peerAddr)
 	if terr != nil {
 		if terr != tcpip.ErrWouldBlock || !blocking {
 			return 0, nil, 0, syserr.TranslateNetstackError(terr)
 		}
 
 		var err *syserr.Error
-		ep, wq, err = s.blockingAccept(t)
+		ep, wq, err = s.blockingAccept(t, peerAddr)
 		if err != nil {
 			return 0, nil, 0, err
 		}
@@ -864,13 +917,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
 
 	var addr linux.SockAddr
 	var addrLen uint32
-	if peerRequested {
-		// Get address of the peer and write it to peer slice.
-		var err *syserr.Error
-		addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t)
-		if err != nil {
-			return 0, nil, 0, err
-		}
+	if peerAddr != nil {
+		addr, addrLen = ConvertAddress(s.family, *peerAddr)
 	}
 
 	fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
@@ -943,47 +991,12 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 		return &val, nil
 	}
 
-	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
-		switch name {
-		case linux.IPT_SO_GET_INFO:
-			if outLen < linux.SizeOfIPTGetinfo {
-				return nil, syserr.ErrInvalidArgument
-			}
-
-			stack := inet.StackFromContext(t)
-			if stack == nil {
-				return nil, syserr.ErrNoDevice
-			}
-			info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr)
-			if err != nil {
-				return nil, err
-			}
-			return &info, nil
-
-		case linux.IPT_SO_GET_ENTRIES:
-			if outLen < linux.SizeOfIPTGetEntries {
-				return nil, syserr.ErrInvalidArgument
-			}
-
-			stack := inet.StackFromContext(t)
-			if stack == nil {
-				return nil, syserr.ErrNoDevice
-			}
-			entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen)
-			if err != nil {
-				return nil, err
-			}
-			return &entries, nil
-
-		}
-	}
-
-	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen)
+	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen)
 }
 
 // GetSockOpt can be used to implement the linux syscall getsockopt(2) for
 // sockets backed by a commonEndpoint.
-func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (marshal.Marshallable, *syserr.Error) {
+func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
 	switch level {
 	case linux.SOL_SOCKET:
 		return getSockOptSocket(t, s, ep, family, skType, name, outLen)
@@ -992,10 +1005,10 @@ func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family in
 		return getSockOptTCP(t, ep, name, outLen)
 
 	case linux.SOL_IPV6:
-		return getSockOptIPv6(t, ep, name, outLen)
+		return getSockOptIPv6(t, s, ep, name, outPtr, outLen)
 
 	case linux.SOL_IP:
-		return getSockOptIP(t, ep, name, outLen, family)
+		return getSockOptIP(t, s, ep, name, outPtr, outLen, family)
 
 	case linux.SOL_UDP,
 		linux.SOL_ICMPV6,
@@ -1025,7 +1038,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 		}
 
 		// Get the last error and convert it.
-		err := ep.GetSockOpt(tcpip.ErrorOption{})
+		err := ep.LastError()
 		if err == nil {
 			optP := primitive.Int32(0)
 			return &optP, nil
@@ -1176,7 +1189,16 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		linger := linux.Linger{}
+		var v tcpip.LingerOption
+		var linger linux.Linger
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		if v.Enabled {
+			linger.OnOff = 1
+		}
+		linger.Linger = int32(v.Timeout.Seconds())
 		return &linger, nil
 
 	case linux.SO_SNDTIMEO:
@@ -1222,6 +1244,18 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
 		vP := primitive.Int32(boolToInt32(v))
 		return &vP, nil
 
+	case linux.SO_ACCEPTCONN:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.AcceptConnOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+		vP := primitive.Int32(boolToInt32(v))
+		return &vP, nil
+
 	default:
 		socket.GetSockOptEmitUnimplementedEvent(t, name)
 	}
@@ -1390,8 +1424,12 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal
 		if err := ep.GetSockOpt(&v); err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		lingerTimeout := primitive.Int32(time.Duration(v) / time.Second)
+		var lingerTimeout primitive.Int32
+		if v >= 0 {
+			lingerTimeout = primitive.Int32(time.Duration(v) / time.Second)
+		} else {
+			lingerTimeout = -1
+		}
 		return &lingerTimeout, nil
 
 	case linux.TCP_DEFER_ACCEPT:
@@ -1437,7 +1475,7 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal
 }
 
 // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6.
-func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
+func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
 	switch name {
 	case linux.IPV6_V6ONLY:
 		if outLen < sizeOfInt32 {
@@ -1490,9 +1528,78 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marsha
 		vP := primitive.Int32(boolToInt32(v))
 		return &vP, nil
 
-	case linux.SO_ORIGINAL_DST:
-		// TODO(gvisor.dev/issue/170): ip6tables.
-		return nil, syserr.ErrInvalidArgument
+	case linux.IP6T_ORIGINAL_DST:
+		if outLen < int(binary.Size(linux.SockAddrInet6{})) {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.OriginalDestinationOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		a, _ := ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v))
+		return a.(*linux.SockAddrInet6), nil
+
+	case linux.IP6T_SO_GET_INFO:
+		if outLen < linux.SizeOfIPTGetinfo {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// Only valid for raw IPv6 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+			return nil, syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return nil, syserr.ErrNoDevice
+		}
+		info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, true)
+		if err != nil {
+			return nil, err
+		}
+		return &info, nil
+
+	case linux.IP6T_SO_GET_ENTRIES:
+		// IPTGetEntries is reused for IPv6.
+		if outLen < linux.SizeOfIPTGetEntries {
+			return nil, syserr.ErrInvalidArgument
+		}
+		// Only valid for raw IPv6 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+			return nil, syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return nil, syserr.ErrNoDevice
+		}
+		entries, err := netfilter.GetEntries6(t, stack.(*Stack).Stack, outPtr, outLen)
+		if err != nil {
+			return nil, err
+		}
+		return &entries, nil
+
+	case linux.IP6T_SO_GET_REVISION_TARGET:
+		if outLen < linux.SizeOfXTGetRevision {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// Only valid for raw IPv6 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+			return nil, syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return nil, syserr.ErrNoDevice
+		}
+		ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber)
+		if err != nil {
+			return nil, err
+		}
+		return &ret, nil
 
 	default:
 		emitUnimplementedEventIPv6(t, name)
@@ -1501,7 +1608,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (marsha
 }
 
 // getSockOptIP implements GetSockOpt when level is SOL_IP.
-func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (marshal.Marshallable, *syserr.Error) {
+func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int, family int) (marshal.Marshallable, *syserr.Error) {
 	switch name {
 	case linux.IP_TTL:
 		if outLen < sizeOfInt32 {
@@ -1617,6 +1724,66 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress(v))
 		return a.(*linux.SockAddrInet), nil
 
+	case linux.IPT_SO_GET_INFO:
+		if outLen < linux.SizeOfIPTGetinfo {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// Only valid for raw IPv4 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
+			return nil, syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return nil, syserr.ErrNoDevice
+		}
+		info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, false)
+		if err != nil {
+			return nil, err
+		}
+		return &info, nil
+
+	case linux.IPT_SO_GET_ENTRIES:
+		if outLen < linux.SizeOfIPTGetEntries {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// Only valid for raw IPv4 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
+			return nil, syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return nil, syserr.ErrNoDevice
+		}
+		entries, err := netfilter.GetEntries4(t, stack.(*Stack).Stack, outPtr, outLen)
+		if err != nil {
+			return nil, err
+		}
+		return &entries, nil
+
+	case linux.IPT_SO_GET_REVISION_TARGET:
+		if outLen < linux.SizeOfXTGetRevision {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		// Only valid for raw IPv4 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
+			return nil, syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return nil, syserr.ErrNoDevice
+		}
+		ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber)
+		if err != nil {
+			return nil, err
+		}
+		return &ret, nil
+
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1650,26 +1817,6 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
 		return nil
 	}
 
-	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
-		switch name {
-		case linux.IPT_SO_SET_REPLACE:
-			if len(optVal) < linux.SizeOfIPTReplace {
-				return syserr.ErrInvalidArgument
-			}
-
-			stack := inet.StackFromContext(t)
-			if stack == nil {
-				return syserr.ErrNoDevice
-			}
-			// Stack must be a netstack stack.
-			return netfilter.SetEntries(stack.(*Stack).Stack, optVal)
-
-		case linux.IPT_SO_SET_ADD_COUNTERS:
-			// TODO(gvisor.dev/issue/170): Counter support.
-			return nil
-		}
-	}
-
 	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
 }
 
@@ -1684,21 +1831,26 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int
 		return setSockOptTCP(t, ep, name, optVal)
 
 	case linux.SOL_IPV6:
-		return setSockOptIPv6(t, ep, name, optVal)
+		return setSockOptIPv6(t, s, ep, name, optVal)
 
 	case linux.SOL_IP:
-		return setSockOptIP(t, ep, name, optVal)
+		return setSockOptIP(t, s, ep, name, optVal)
+
+	case linux.SOL_PACKET:
+		// gVisor doesn't support any SOL_PACKET options just return not
+		// supported. Returning nil here will result in tcpdump thinking AF_PACKET
+		// features are supported and proceed to use them and break.
+		t.Kernel().EmitUnimplementedEvent(t)
+		return syserr.ErrProtocolNotAvailable
 
 	case linux.SOL_UDP,
 		linux.SOL_ICMPV6,
-		linux.SOL_RAW,
-		linux.SOL_PACKET:
+		linux.SOL_RAW:
 
 		t.Kernel().EmitUnimplementedEvent(t)
 	}
 
-	// Default to the old behavior; hand off to network stack.
-	return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+	return nil
 }
 
 // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
@@ -1743,7 +1895,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 		name := string(optVal[:n])
 		if name == "" {
-			return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(0)))
+			v := tcpip.BindToDeviceOption(0)
+			return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
 		}
 		s := t.NetworkContext()
 		if s == nil {
@@ -1751,7 +1904,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 		}
 		for nicID, nic := range s.Interfaces() {
 			if nic.Name == name {
-				return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(nicID)))
+				v := tcpip.BindToDeviceOption(nicID)
+				return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
 			}
 		}
 		return syserr.ErrUnknownDevice
@@ -1817,7 +1971,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 			socket.SetSockOptEmitUnimplementedEvent(t, name)
 		}
 
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.OutOfBandInlineOption(v)))
+		opt := tcpip.OutOfBandInlineOption(v)
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
 
 	case linux.SO_NO_CHECK:
 		if len(optVal) < sizeOfInt32 {
@@ -1839,19 +1994,21 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam
 			socket.SetSockOptEmitUnimplementedEvent(t, name)
 		}
 
-		return nil
+		return syserr.TranslateNetstackError(
+			ep.SetSockOpt(&tcpip.LingerOption{
+				Enabled: v.OnOff != 0,
+				Timeout: time.Second * time.Duration(v.Linger)}))
 
 	case linux.SO_DETACH_FILTER:
 		// optval is ignored.
 		var v tcpip.SocketDetachFilterOption
-		return syserr.TranslateNetstackError(ep.SetSockOpt(v))
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
 
 	default:
 		socket.SetSockOptEmitUnimplementedEvent(t, name)
 	}
 
-	// Default to the old behavior; hand off to network stack.
-	return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+	return nil
 }
 
 // setSockOptTCP implements SetSockOpt when level is SOL_TCP.
@@ -1898,7 +2055,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		if v < 1 || v > linux.MAX_TCP_KEEPIDLE {
 			return syserr.ErrInvalidArgument
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIdleOption(time.Second * time.Duration(v))))
+		opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v))
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
 
 	case linux.TCP_KEEPINTVL:
 		if len(optVal) < sizeOfInt32 {
@@ -1909,7 +2067,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		if v < 1 || v > linux.MAX_TCP_KEEPINTVL {
 			return syserr.ErrInvalidArgument
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))))
+		opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
 
 	case linux.TCP_KEEPCNT:
 		if len(optVal) < sizeOfInt32 {
@@ -1931,11 +2090,12 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		if v < 0 {
 			return syserr.ErrInvalidArgument
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))))
+		opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
 
 	case linux.TCP_CONGESTION:
 		v := tcpip.CongestionControlOption(optVal)
-		if err := ep.SetSockOpt(v); err != nil {
+		if err := ep.SetSockOpt(&v); err != nil {
 			return syserr.TranslateNetstackError(err)
 		}
 		return nil
@@ -1945,8 +2105,9 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 			return syserr.ErrInvalidArgument
 		}
 
-		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))))
+		v := int32(usermem.ByteOrder.Uint32(optVal))
+		opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
 
 	case linux.TCP_DEFER_ACCEPT:
 		if len(optVal) < sizeOfInt32 {
@@ -1956,7 +2117,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		if v < 0 {
 			v = 0
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))))
+		opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
 
 	case linux.TCP_SYNCNT:
 		if len(optVal) < sizeOfInt32 {
@@ -1981,12 +2143,11 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		emitUnimplementedEventTCP(t, name)
 	}
 
-	// Default to the old behavior; hand off to network stack.
-	return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+	return nil
 }
 
 // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6.
-func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
+func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
 	switch name {
 	case linux.IPV6_V6ONLY:
 		if len(optVal) < sizeOfInt32 {
@@ -2035,12 +2196,32 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 
 		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0))
 
+	case linux.IP6T_SO_SET_REPLACE:
+		if len(optVal) < linux.SizeOfIP6TReplace {
+			return syserr.ErrInvalidArgument
+		}
+
+		// Only valid for raw IPv6 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW {
+			return syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return syserr.ErrNoDevice
+		}
+		// Stack must be a netstack stack.
+		return netfilter.SetEntries(stack.(*Stack).Stack, optVal, true)
+
+	case linux.IP6T_SO_SET_ADD_COUNTERS:
+		// TODO(gvisor.dev/issue/170): Counter support.
+		return nil
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
 
-	// Default to the old behavior; hand off to network stack.
-	return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+	return nil
 }
 
 var (
@@ -2095,7 +2276,7 @@ func parseIntOrChar(buf []byte) (int32, *syserr.Error) {
 }
 
 // setSockOptIP implements SetSockOpt when level is SOL_IP.
-func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
+func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
 	switch name {
 	case linux.IP_MULTICAST_TTL:
 		v, err := parseIntOrChar(optVal)
@@ -2118,7 +2299,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 			return err
 		}
 
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.AddMembershipOption{
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{
 			NIC: tcpip.NICID(req.InterfaceIndex),
 			// TODO(igudger): Change AddMembership to use the standard
 			// any address representation.
@@ -2132,7 +2313,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 			return err
 		}
 
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.RemoveMembershipOption{
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{
 			NIC: tcpip.NICID(req.InterfaceIndex),
 			// TODO(igudger): Change DropMembership to use the standard
 			// any address representation.
@@ -2146,7 +2327,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 			return err
 		}
 
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastInterfaceOption{
+		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{
 			NIC:           tcpip.NICID(req.InterfaceIndex),
 			InterfaceAddr: bytesToIPAddress(req.InterfaceAddr[:]),
 		}))
@@ -2215,6 +2396,27 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.IPHdrIncludedOption, v != 0))
 
+	case linux.IPT_SO_SET_REPLACE:
+		if len(optVal) < linux.SizeOfIPTReplace {
+			return syserr.ErrInvalidArgument
+		}
+
+		// Only valid for raw IPv4 sockets.
+		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
+			return syserr.ErrProtocolNotAvailable
+		}
+
+		stack := inet.StackFromContext(t)
+		if stack == nil {
+			return syserr.ErrNoDevice
+		}
+		// Stack must be a netstack stack.
+		return netfilter.SetEntries(stack.(*Stack).Stack, optVal, false)
+
+	case linux.IPT_SO_SET_ADD_COUNTERS:
+		// TODO(gvisor.dev/issue/170): Counter support.
+		return nil
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
@@ -2249,8 +2451,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		t.Kernel().EmitUnimplementedEvent(t)
 	}
 
-	// Default to the old behavior; hand off to network stack.
-	return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
+	return nil
 }
 
 // emitUnimplementedEventTCP emits unimplemented event if name is valid. This
diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go
index 3335e7430..b0d9e4d9e 100644
--- a/pkg/sentry/socket/netstack/netstack_vfs2.go
+++ b/pkg/sentry/socket/netstack/netstack_vfs2.go
@@ -18,25 +18,25 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/amutex"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
-	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
-	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
-	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // SocketVFS2 encapsulates all the state needed to represent a network stack
 // endpoint in the kernel context.
+//
+// +stateify savable
 type SocketVFS2 struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -57,7 +57,8 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu
 	}
 
 	mnt := t.Kernel().SocketMount()
-	d := sockfs.NewDentry(t.Credentials(), mnt)
+	d := sockfs.NewDentry(t, mnt)
+	defer d.DecRef(t)
 
 	s := &SocketVFS2{
 		socketOpsCommon: socketOpsCommon{
@@ -80,6 +81,13 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu
 	return vfsfd, nil
 }
 
+// Release implements vfs.FileDescriptionImpl.Release.
+func (s *SocketVFS2) Release(ctx context.Context) {
+	t := kernel.TaskFromContext(ctx)
+	t.Kernel().DeleteSocketVFS2(&s.vfsfd)
+	s.socketOpsCommon.Release(ctx)
+}
+
 // Readiness implements waiter.Waitable.Readiness.
 func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return s.socketOpsCommon.Readiness(mask)
@@ -152,14 +160,18 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs
 // tcpip.Endpoint.
 func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
 	// Issue the accept request to get the new endpoint.
-	ep, wq, terr := s.Endpoint.Accept()
+	var peerAddr *tcpip.FullAddress
+	if peerRequested {
+		peerAddr = &tcpip.FullAddress{}
+	}
+	ep, wq, terr := s.Endpoint.Accept(peerAddr)
 	if terr != nil {
 		if terr != tcpip.ErrWouldBlock || !blocking {
 			return 0, nil, 0, syserr.TranslateNetstackError(terr)
 		}
 
 		var err *syserr.Error
-		ep, wq, err = s.blockingAccept(t)
+		ep, wq, err = s.blockingAccept(t, peerAddr)
 		if err != nil {
 			return 0, nil, 0, err
 		}
@@ -177,13 +189,9 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
 
 	var addr linux.SockAddr
 	var addrLen uint32
-	if peerRequested {
+	if peerAddr != nil {
 		// Get address of the peer and write it to peer slice.
-		var err *syserr.Error
-		addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t)
-		if err != nil {
-			return 0, nil, 0, err
-		}
+		addr, addrLen = ConvertAddress(s.family, *peerAddr)
 	}
 
 	fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
@@ -233,42 +241,7 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.
 		return &val, nil
 	}
 
-	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
-		switch name {
-		case linux.IPT_SO_GET_INFO:
-			if outLen < linux.SizeOfIPTGetinfo {
-				return nil, syserr.ErrInvalidArgument
-			}
-
-			stack := inet.StackFromContext(t)
-			if stack == nil {
-				return nil, syserr.ErrNoDevice
-			}
-			info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr)
-			if err != nil {
-				return nil, err
-			}
-			return &info, nil
-
-		case linux.IPT_SO_GET_ENTRIES:
-			if outLen < linux.SizeOfIPTGetEntries {
-				return nil, syserr.ErrInvalidArgument
-			}
-
-			stack := inet.StackFromContext(t)
-			if stack == nil {
-				return nil, syserr.ErrNoDevice
-			}
-			entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen)
-			if err != nil {
-				return nil, err
-			}
-			return &entries, nil
-
-		}
-	}
-
-	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen)
+	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen)
 }
 
 // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
@@ -298,26 +271,6 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by
 		return nil
 	}
 
-	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
-		switch name {
-		case linux.IPT_SO_SET_REPLACE:
-			if len(optVal) < linux.SizeOfIPTReplace {
-				return syserr.ErrInvalidArgument
-			}
-
-			stack := inet.StackFromContext(t)
-			if stack == nil {
-				return syserr.ErrNoDevice
-			}
-			// Stack must be a netstack stack.
-			return netfilter.SetEntries(stack.(*Stack).Stack, optVal)
-
-		case linux.IPT_SO_SET_ADD_COUNTERS:
-			// TODO(gvisor.dev/issue/170): Counter support.
-			return nil
-		}
-	}
-
 	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
 }
 
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index f0fe18684..fa9ac9059 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -100,62 +100,107 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
 	return nicAddrs
 }
 
-// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
-func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+// convertAddr converts an InterfaceAddr to a ProtocolAddress.
+func convertAddr(addr inet.InterfaceAddr) (tcpip.ProtocolAddress, error) {
 	var (
-		protocol tcpip.NetworkProtocolNumber
-		address  tcpip.Address
+		protocol        tcpip.NetworkProtocolNumber
+		address         tcpip.Address
+		protocolAddress tcpip.ProtocolAddress
 	)
 	switch addr.Family {
 	case linux.AF_INET:
-		if len(addr.Addr) < header.IPv4AddressSize {
-			return syserror.EINVAL
+		if len(addr.Addr) != header.IPv4AddressSize {
+			return protocolAddress, syserror.EINVAL
 		}
 		if addr.PrefixLen > header.IPv4AddressSize*8 {
-			return syserror.EINVAL
+			return protocolAddress, syserror.EINVAL
 		}
 		protocol = ipv4.ProtocolNumber
-		address = tcpip.Address(addr.Addr[:header.IPv4AddressSize])
-
+		address = tcpip.Address(addr.Addr)
 	case linux.AF_INET6:
-		if len(addr.Addr) < header.IPv6AddressSize {
-			return syserror.EINVAL
+		if len(addr.Addr) != header.IPv6AddressSize {
+			return protocolAddress, syserror.EINVAL
 		}
 		if addr.PrefixLen > header.IPv6AddressSize*8 {
-			return syserror.EINVAL
+			return protocolAddress, syserror.EINVAL
 		}
 		protocol = ipv6.ProtocolNumber
-		address = tcpip.Address(addr.Addr[:header.IPv6AddressSize])
-
+		address = tcpip.Address(addr.Addr)
 	default:
-		return syserror.ENOTSUP
+		return protocolAddress, syserror.ENOTSUP
 	}
 
-	protocolAddress := tcpip.ProtocolAddress{
+	protocolAddress = tcpip.ProtocolAddress{
 		Protocol: protocol,
 		AddressWithPrefix: tcpip.AddressWithPrefix{
 			Address:   address,
 			PrefixLen: int(addr.PrefixLen),
 		},
 	}
+	return protocolAddress, nil
+}
+
+// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
+func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+	protocolAddress, err := convertAddr(addr)
+	if err != nil {
+		return err
+	}
 
 	// Attach address to interface.
-	if err := s.Stack.AddProtocolAddressWithOptions(tcpip.NICID(idx), protocolAddress, stack.CanBePrimaryEndpoint); err != nil {
+	nicID := tcpip.NICID(idx)
+	if err := s.Stack.AddProtocolAddressWithOptions(nicID, protocolAddress, stack.CanBePrimaryEndpoint); err != nil {
 		return syserr.TranslateNetstackError(err).ToError()
 	}
 
-	// Add route for local network.
-	s.Stack.AddRoute(tcpip.Route{
+	// Add route for local network if it doesn't exist already.
+	localRoute := tcpip.Route{
 		Destination: protocolAddress.AddressWithPrefix.Subnet(),
 		Gateway:     "", // No gateway for local network.
-		NIC:         tcpip.NICID(idx),
+		NIC:         nicID,
+	}
+
+	for _, rt := range s.Stack.GetRouteTable() {
+		if rt.Equal(localRoute) {
+			return nil
+		}
+	}
+
+	// Local route does not exist yet. Add it.
+	s.Stack.AddRoute(localRoute)
+
+	return nil
+}
+
+// RemoveInterfaceAddr implements inet.Stack.RemoveInterfaceAddr.
+func (s *Stack) RemoveInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+	protocolAddress, err := convertAddr(addr)
+	if err != nil {
+		return err
+	}
+
+	// Remove addresses matching the address and prefix.
+	nicID := tcpip.NICID(idx)
+	if err := s.Stack.RemoveAddress(nicID, protocolAddress.AddressWithPrefix.Address); err != nil {
+		return syserr.TranslateNetstackError(err).ToError()
+	}
+
+	// Remove the corresponding local network route if it exists.
+	localRoute := tcpip.Route{
+		Destination: protocolAddress.AddressWithPrefix.Subnet(),
+		Gateway:     "", // No gateway for local network.
+		NIC:         nicID,
+	}
+	s.Stack.RemoveRoutes(func(rt tcpip.Route) bool {
+		return rt.Equal(localRoute)
 	})
+
 	return nil
 }
 
 // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize.
 func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
-	var rs tcp.ReceiveBufferSizeOption
+	var rs tcpip.TCPReceiveBufferSizeRangeOption
 	err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &rs)
 	return inet.TCPBufferSize{
 		Min:     rs.Min,
@@ -166,17 +211,17 @@ func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
 
 // SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize.
 func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error {
-	rs := tcp.ReceiveBufferSizeOption{
+	rs := tcpip.TCPReceiveBufferSizeRangeOption{
 		Min:     size.Min,
 		Default: size.Default,
 		Max:     size.Max,
 	}
-	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, rs)).ToError()
+	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &rs)).ToError()
 }
 
 // TCPSendBufferSize implements inet.Stack.TCPSendBufferSize.
 func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) {
-	var ss tcp.SendBufferSizeOption
+	var ss tcpip.TCPSendBufferSizeRangeOption
 	err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &ss)
 	return inet.TCPBufferSize{
 		Min:     ss.Min,
@@ -187,29 +232,30 @@ func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) {
 
 // SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize.
 func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error {
-	ss := tcp.SendBufferSizeOption{
+	ss := tcpip.TCPSendBufferSizeRangeOption{
 		Min:     size.Min,
 		Default: size.Default,
 		Max:     size.Max,
 	}
-	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, ss)).ToError()
+	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &ss)).ToError()
 }
 
 // TCPSACKEnabled implements inet.Stack.TCPSACKEnabled.
 func (s *Stack) TCPSACKEnabled() (bool, error) {
-	var sack tcp.SACKEnabled
+	var sack tcpip.TCPSACKEnabled
 	err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &sack)
 	return bool(sack), syserr.TranslateNetstackError(err).ToError()
 }
 
 // SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled.
 func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
-	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError()
+	opt := tcpip.TCPSACKEnabled(enabled)
+	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError()
 }
 
 // TCPRecovery implements inet.Stack.TCPRecovery.
 func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) {
-	var recovery tcp.Recovery
+	var recovery tcpip.TCPRecovery
 	if err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &recovery); err != nil {
 		return 0, syserr.TranslateNetstackError(err).ToError()
 	}
@@ -218,7 +264,8 @@ func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) {
 
 // SetTCPRecovery implements inet.Stack.SetTCPRecovery.
 func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error {
-	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.Recovery(recovery))).ToError()
+	opt := tcpip.TCPRecovery(recovery)
+	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError()
 }
 
 // Statistics implements inet.Stack.Statistics.
@@ -410,3 +457,24 @@ func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint {
 func (s *Stack) RestoreCleanupEndpoints(es []stack.TransportEndpoint) {
 	s.Stack.RestoreCleanupEndpoints(es)
 }
+
+// Forwarding implements inet.Stack.Forwarding.
+func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool {
+	switch protocol {
+	case ipv4.ProtocolNumber, ipv6.ProtocolNumber:
+		return s.Stack.Forwarding(protocol)
+	default:
+		panic(fmt.Sprintf("Forwarding(%v) failed: unsupported protocol", protocol))
+	}
+}
+
+// SetForwarding implements inet.Stack.SetForwarding.
+func (s *Stack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error {
+	switch protocol {
+	case ipv4.ProtocolNumber, ipv6.ProtocolNumber:
+		s.Stack.SetForwarding(protocol, enable)
+	default:
+		panic(fmt.Sprintf("SetForwarding(%v) failed: unsupported protocol", protocol))
+	}
+	return nil
+}
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 04b259d27..fd31479e5 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
@@ -35,7 +36,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
 // ControlMessages represents the union of unix control messages and tcpip
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index cb953e4dc..cce0acc33 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -7,10 +7,21 @@ go_template_instance(
     name = "socket_refs",
     out = "socket_refs.go",
     package = "unix",
-    prefix = "socketOpsCommon",
-    template = "//pkg/refs_vfs2:refs_template",
+    prefix = "socketOperations",
+    template = "//pkg/refsvfs2:refs_template",
     types = {
-        "T": "socketOpsCommon",
+        "T": "SocketOperations",
+    },
+)
+
+go_template_instance(
+    name = "socket_vfs2_refs",
+    out = "socket_vfs2_refs.go",
+    package = "unix",
+    prefix = "socketVFS2",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "SocketVFS2",
     },
 )
 
@@ -20,6 +31,7 @@ go_library(
         "device.go",
         "io.go",
         "socket_refs.go",
+        "socket_vfs2_refs.go",
         "unix.go",
         "unix_vfs2.go",
     ],
@@ -29,7 +41,9 @@ go_library(
         "//pkg/context",
         "//pkg/fspath",
         "//pkg/log",
+        "//pkg/marshal",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/device",
@@ -49,6 +63,5 @@ go_library(
         "//pkg/tcpip",
         "//pkg/usermem",
         "//pkg/waiter",
-        "//tools/go_marshal/marshal",
     ],
 )
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index c708b6030..3ebbd28b0 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -15,6 +15,17 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "queue_refs",
+    out = "queue_refs.go",
+    package = "transport",
+    prefix = "queue",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "queue",
+    },
+)
+
 go_library(
     name = "transport",
     srcs = [
@@ -22,6 +33,7 @@ go_library(
         "connectioned_state.go",
         "connectionless.go",
         "queue.go",
+        "queue_refs.go",
         "transport_message_list.go",
         "unix.go",
     ],
@@ -32,6 +44,7 @@ go_library(
         "//pkg/ilist",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/tcpip",
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index c67b602f0..aa4f3c04d 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -142,9 +142,9 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E
 	}
 
 	q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit}
-	q1.EnableLeakCheck("transport.queue")
+	q1.EnableLeakCheck()
 	q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit}
-	q2.EnableLeakCheck("transport.queue")
+	q2.EnableLeakCheck()
 
 	if stype == linux.SOCK_STREAM {
 		a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}}
@@ -300,14 +300,14 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn
 	}
 
 	readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit}
-	readQueue.EnableLeakCheck("transport.queue")
+	readQueue.EnableLeakCheck()
 	ne.connected = &connectedEndpoint{
 		endpoint:   ce,
 		writeQueue: readQueue,
 	}
 
 	writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit}
-	writeQueue.EnableLeakCheck("transport.queue")
+	writeQueue.EnableLeakCheck()
 	if e.stype == linux.SOCK_STREAM {
 		ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}}
 	} else {
@@ -391,7 +391,7 @@ func (e *connectionedEndpoint) Listen(backlog int) *syserr.Error {
 }
 
 // Accept accepts a new connection.
-func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) {
+func (e *connectionedEndpoint) Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error) {
 	e.Lock()
 	defer e.Unlock()
 
@@ -401,6 +401,18 @@ func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) {
 
 	select {
 	case ne := <-e.acceptedChan:
+		if peerAddr != nil {
+			ne.Lock()
+			c := ne.connected
+			ne.Unlock()
+			if c != nil {
+				addr, err := c.GetLocalAddress()
+				if err != nil {
+					return nil, syserr.TranslateNetstackError(err)
+				}
+				*peerAddr = addr
+			}
+		}
 		return ne, nil
 
 	default:
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index 70ee8f9b8..f8aacca13 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -42,7 +42,7 @@ var (
 func NewConnectionless(ctx context.Context) Endpoint {
 	ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}}
 	q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit}
-	q.EnableLeakCheck("transport.queue")
+	q.EnableLeakCheck()
 	ep.receiver = &queueReceiver{readQueue: &q}
 	return ep
 }
@@ -144,12 +144,12 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi
 }
 
 // Listen starts listening on the connection.
-func (e *connectionlessEndpoint) Listen(int) *syserr.Error {
+func (*connectionlessEndpoint) Listen(int) *syserr.Error {
 	return syserr.ErrNotSupported
 }
 
 // Accept accepts a new connection.
-func (e *connectionlessEndpoint) Accept() (Endpoint, *syserr.Error) {
+func (*connectionlessEndpoint) Accept(*tcpip.FullAddress) (Endpoint, *syserr.Error) {
 	return nil, syserr.ErrNotSupported
 }
 
diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go
index ef6043e19..342def28f 100644
--- a/pkg/sentry/socket/unix/transport/queue.go
+++ b/pkg/sentry/socket/unix/transport/queue.go
@@ -16,7 +16,6 @@ package transport
 
 import (
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -28,7 +27,7 @@ import (
 //
 // +stateify savable
 type queue struct {
-	refs.AtomicRefCount
+	queueRefs
 
 	ReaderQueue *waiter.Queue
 	WriterQueue *waiter.Queue
@@ -68,11 +67,13 @@ func (q *queue) Reset(ctx context.Context) {
 	q.mu.Unlock()
 }
 
-// DecRef implements RefCounter.DecRef with destructor q.Reset.
+// DecRef implements RefCounter.DecRef.
 func (q *queue) DecRef(ctx context.Context) {
-	q.DecRefWithDestructor(ctx, q.Reset)
-	// We don't need to notify after resetting because no one cares about
-	// this queue after all references have been dropped.
+	q.queueRefs.DecRef(func() {
+		// We don't need to notify after resetting because no one cares about
+		// this queue after all references have been dropped.
+		q.Reset(ctx)
+	})
 }
 
 // IsReadable determines if q is currently readable.
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 475d7177e..b648273a4 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -32,6 +32,8 @@ import (
 const initialLimit = 16 * 1024
 
 // A RightsControlMessage is a control message containing FDs.
+//
+// +stateify savable
 type RightsControlMessage interface {
 	// Clone returns a copy of the RightsControlMessage.
 	Clone() RightsControlMessage
@@ -151,7 +153,10 @@ type Endpoint interface {
 	// block if no new connections are available.
 	//
 	// The returned Queue is the wait queue for the newly created endpoint.
-	Accept() (Endpoint, *syserr.Error)
+	//
+	// peerAddr if not nil will be populated with the address of the connected
+	// peer on a successful accept.
+	Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error)
 
 	// Bind binds the endpoint to a specific local address and port.
 	// Specifying a NIC is optional.
@@ -172,9 +177,8 @@ type Endpoint interface {
 	// connected.
 	GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error)
 
-	// SetSockOpt sets a socket option. opt should be one of the tcpip.*Option
-	// types.
-	SetSockOpt(opt interface{}) *tcpip.Error
+	// SetSockOpt sets a socket option.
+	SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error
 
 	// SetSockOptBool sets a socket option for simple cases when a value has
 	// the int type.
@@ -184,9 +188,8 @@ type Endpoint interface {
 	// the int type.
 	SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error
 
-	// GetSockOpt gets a socket option. opt should be a pointer to one of the
-	// tcpip.*Option types.
-	GetSockOpt(opt interface{}) *tcpip.Error
+	// GetSockOpt gets a socket option.
+	GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error
 
 	// GetSockOptBool gets a socket option for simple cases when a return
 	// value has the int type.
@@ -199,6 +202,9 @@ type Endpoint interface {
 	// State returns the current state of the socket, as represented by Linux in
 	// procfs.
 	State() uint32
+
+	// LastError implements tcpip.Endpoint.LastError.
+	LastError() *tcpip.Error
 }
 
 // A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket
@@ -332,7 +338,7 @@ type Receiver interface {
 	RecvMaxQueueSize() int64
 
 	// Release releases any resources owned by the Receiver. It should be
-	// called before droping all references to a Receiver.
+	// called before dropping all references to a Receiver.
 	Release(ctx context.Context)
 }
 
@@ -483,7 +489,7 @@ func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, wantCreds
 		c := q.control.Clone()
 
 		// Don't consume data since we are peeking.
-		copied, data, _ = vecCopy(data, q.buffer)
+		copied, _, _ = vecCopy(data, q.buffer)
 
 		return copied, copied, c, false, q.addr, notify, nil
 	}
@@ -568,6 +574,12 @@ func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, wantCreds
 	return copied, copied, c, cmTruncated, q.addr, notify, nil
 }
 
+// Release implements Receiver.Release.
+func (q *streamQueueReceiver) Release(ctx context.Context) {
+	q.queueReceiver.Release(ctx)
+	q.control.Release(ctx)
+}
+
 // A ConnectedEndpoint is an Endpoint that can be used to send Messages.
 type ConnectedEndpoint interface {
 	// Passcred implements Endpoint.Passcred.
@@ -615,7 +627,7 @@ type ConnectedEndpoint interface {
 	SendMaxQueueSize() int64
 
 	// Release releases any resources owned by the ConnectedEndpoint. It should
-	// be called before droping all references to a ConnectedEndpoint.
+	// be called before dropping all references to a ConnectedEndpoint.
 	Release(ctx context.Context)
 
 	// CloseUnread sets the fact that this end is closed with unread data to
@@ -742,6 +754,9 @@ type baseEndpoint struct {
 	// path is not empty if the endpoint has been bound,
 	// or may be used if the endpoint is connected.
 	path string
+
+	// linger is used for SO_LINGER socket option.
+	linger tcpip.LingerOption
 }
 
 // EventRegister implements waiter.Waitable.EventRegister.
@@ -837,8 +852,14 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess
 	return n, err
 }
 
-// SetSockOpt sets a socket option. Currently not supported.
-func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+// SetSockOpt sets a socket option.
+func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
+	switch v := opt.(type) {
+	case *tcpip.LingerOption:
+		e.Lock()
+		e.linger = *v
+		e.Unlock()
+	}
 	return nil
 }
 
@@ -866,7 +887,7 @@ func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 
 func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	switch opt {
-	case tcpip.KeepaliveEnabledOption:
+	case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption:
 		return false, nil
 
 	case tcpip.PasscredOption:
@@ -940,9 +961,12 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.ErrorOption:
+func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
+	switch o := opt.(type) {
+	case *tcpip.LingerOption:
+		e.Lock()
+		*o = e.linger
+		e.Unlock()
 		return nil
 
 	default:
@@ -951,6 +975,11 @@ func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	}
 }
 
+// LastError implements Endpoint.LastError.
+func (*baseEndpoint) LastError() *tcpip.Error {
+	return nil
+}
+
 // Shutdown closes the read and/or write end of the endpoint connection to its
 // peer.
 func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error {
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index b7e8e4325..adad485a9 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -24,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
@@ -39,7 +40,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
 // SocketOperations is a Unix socket. It is similar to a netstack socket,
@@ -55,6 +55,7 @@ type SocketOperations struct {
 	fsutil.FileNoopFlush            `state:"nosave"`
 	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
 
+	socketOperationsRefs
 	socketOpsCommon
 }
 
@@ -80,15 +81,30 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty
 		},
 	}
 	s.EnableLeakCheck()
-
 	return fs.NewFile(ctx, d, flags, &s)
 }
 
+// DecRef implements RefCounter.DecRef.
+func (s *SocketOperations) DecRef(ctx context.Context) {
+	s.socketOperationsRefs.DecRef(func() {
+		s.ep.Close(ctx)
+		if s.abstractNamespace != nil {
+			s.abstractNamespace.Remove(s.abstractName, s)
+		}
+	})
+}
+
+// Release implemements fs.FileOperations.Release.
+func (s *SocketOperations) Release(ctx context.Context) {
+	// Release only decrements a reference on s because s may be referenced in
+	// the abstract socket namespace.
+	s.DecRef(ctx)
+}
+
 // socketOpsCommon contains the socket operations common to VFS1 and VFS2.
 //
 // +stateify savable
 type socketOpsCommon struct {
-	socketOpsCommonRefs
 	socket.SendReceiveTimeout
 
 	ep    transport.Endpoint
@@ -101,23 +117,6 @@ type socketOpsCommon struct {
 	abstractNamespace *kernel.AbstractSocketNamespace
 }
 
-// DecRef implements RefCounter.DecRef.
-func (s *socketOpsCommon) DecRef(ctx context.Context) {
-	s.socketOpsCommonRefs.DecRef(func() {
-		s.ep.Close(ctx)
-		if s.abstractNamespace != nil {
-			s.abstractNamespace.Remove(s.abstractName, s)
-		}
-	})
-}
-
-// Release implemements fs.FileOperations.Release.
-func (s *socketOpsCommon) Release(ctx context.Context) {
-	// Release only decrements a reference on s because s may be referenced in
-	// the abstract socket namespace.
-	s.DecRef(ctx)
-}
-
 func (s *socketOpsCommon) isPacket() bool {
 	switch s.stype {
 	case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
@@ -194,7 +193,7 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
 // a transport.Endpoint.
 func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
-	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
+	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen)
 }
 
 // Listen implements the linux syscall listen(2) for sockets backed by
@@ -205,7 +204,7 @@ func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
 
 // blockingAccept implements a blocking version of accept(2), that is, if no
 // connections are ready to be accept, it will block until one becomes ready.
-func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) {
+func (s *SocketOperations) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) {
 	// Register for notifications.
 	e, ch := waiter.NewChannelEntry(nil)
 	s.EventRegister(&e, waiter.EventIn)
@@ -214,7 +213,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *
 	// Try to accept the connection; if it fails, then wait until we get a
 	// notification.
 	for {
-		if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock {
+		if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock {
 			return ep, err
 		}
 
@@ -227,15 +226,18 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *
 // Accept implements the linux syscall accept(2) for sockets backed by
 // a transport.Endpoint.
 func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
-	// Issue the accept request to get the new endpoint.
-	ep, err := s.ep.Accept()
+	var peerAddr *tcpip.FullAddress
+	if peerRequested {
+		peerAddr = &tcpip.FullAddress{}
+	}
+	ep, err := s.ep.Accept(peerAddr)
 	if err != nil {
 		if err != syserr.ErrWouldBlock || !blocking {
 			return 0, nil, 0, err
 		}
 
 		var err *syserr.Error
-		ep, err = s.blockingAccept(t)
+		ep, err = s.blockingAccept(t, peerAddr)
 		if err != nil {
 			return 0, nil, 0, err
 		}
@@ -252,13 +254,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
 
 	var addr linux.SockAddr
 	var addrLen uint32
-	if peerRequested {
-		// Get address of the peer.
-		var err *syserr.Error
-		addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t)
-		if err != nil {
-			return 0, nil, 0, err
-		}
+	if peerAddr != nil {
+		addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr)
 	}
 
 	fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
@@ -575,13 +572,17 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
 	if dst.NumBytes() == 0 {
 		return 0, nil
 	}
-	return dst.CopyOutFrom(ctx, &EndpointReader{
+	r := &EndpointReader{
 		Ctx:       ctx,
 		Endpoint:  s.ep,
 		NumRights: 0,
 		Peek:      false,
 		From:      nil,
-	})
+	}
+	n, err := dst.CopyOutFrom(ctx, r)
+	// Drop control messages.
+	r.Control.Release(ctx)
+	return n, err
 }
 
 // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index d066ef8ab..7a78444dc 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -18,6 +18,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
@@ -32,17 +33,19 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
 // SocketVFS2 implements socket.SocketVFS2 (and by extension,
 // vfs.FileDescriptionImpl) for Unix sockets.
+//
+// +stateify savable
 type SocketVFS2 struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
 	vfs.LockFD
 
+	socketVFS2Refs
 	socketOpsCommon
 }
 
@@ -52,7 +55,8 @@ var _ = socket.SocketVFS2(&SocketVFS2{})
 // returns a corresponding file description.
 func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) {
 	mnt := t.Kernel().SocketMount()
-	d := sockfs.NewDentry(t.Credentials(), mnt)
+	d := sockfs.NewDentry(t, mnt)
+	defer d.DecRef(t)
 
 	fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &vfs.FileLocks{})
 	if err != nil {
@@ -76,6 +80,7 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3
 			stype: stype,
 		},
 	}
+	sock.EnableLeakCheck()
 	sock.LockFD.Init(locks)
 	vfsfd := &sock.vfsfd
 	if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{
@@ -88,15 +93,34 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3
 	return vfsfd, nil
 }
 
+// DecRef implements RefCounter.DecRef.
+func (s *SocketVFS2) DecRef(ctx context.Context) {
+	s.socketVFS2Refs.DecRef(func() {
+		t := kernel.TaskFromContext(ctx)
+		t.Kernel().DeleteSocketVFS2(&s.vfsfd)
+		s.ep.Close(ctx)
+		if s.abstractNamespace != nil {
+			s.abstractNamespace.Remove(s.abstractName, s)
+		}
+	})
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (s *SocketVFS2) Release(ctx context.Context) {
+	// Release only decrements a reference on s because s may be referenced in
+	// the abstract socket namespace.
+	s.DecRef(ctx)
+}
+
 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
 // a transport.Endpoint.
 func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
-	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
+	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen)
 }
 
 // blockingAccept implements a blocking version of accept(2), that is, if no
 // connections are ready to be accept, it will block until one becomes ready.
-func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) {
+func (s *SocketVFS2) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) {
 	// Register for notifications.
 	e, ch := waiter.NewChannelEntry(nil)
 	s.socketOpsCommon.EventRegister(&e, waiter.EventIn)
@@ -105,7 +129,7 @@ func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr
 	// Try to accept the connection; if it fails, then wait until we get a
 	// notification.
 	for {
-		if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock {
+		if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock {
 			return ep, err
 		}
 
@@ -118,15 +142,18 @@ func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr
 // Accept implements the linux syscall accept(2) for sockets backed by
 // a transport.Endpoint.
 func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
-	// Issue the accept request to get the new endpoint.
-	ep, err := s.ep.Accept()
+	var peerAddr *tcpip.FullAddress
+	if peerRequested {
+		peerAddr = &tcpip.FullAddress{}
+	}
+	ep, err := s.ep.Accept(peerAddr)
 	if err != nil {
 		if err != syserr.ErrWouldBlock || !blocking {
 			return 0, nil, 0, err
 		}
 
 		var err *syserr.Error
-		ep, err = s.blockingAccept(t)
+		ep, err = s.blockingAccept(t, peerAddr)
 		if err != nil {
 			return 0, nil, 0, err
 		}
@@ -144,13 +171,8 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block
 
 	var addr linux.SockAddr
 	var addrLen uint32
-	if peerRequested {
-		// Get address of the peer.
-		var err *syserr.Error
-		addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t)
-		if err != nil {
-			return 0, nil, 0, err
-		}
+	if peerAddr != nil {
+		addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr)
 	}
 
 	fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
@@ -246,13 +268,17 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.
 	if dst.NumBytes() == 0 {
 		return 0, nil
 	}
-	return dst.CopyOutFrom(ctx, &EndpointReader{
+	r := &EndpointReader{
 		Ctx:       ctx,
 		Endpoint:  s.ep,
 		NumRights: 0,
 		Peek:      false,
 		From:      nil,
-	})
+	}
+	n, err := dst.CopyOutFrom(ctx, r)
+	// Drop control messages.
+	r.Control.Release(ctx)
+	return n, err
 }
 
 // PWrite implements vfs.FileDescriptionImpl.
diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD
index 0ea4aab8b..563d60578 100644
--- a/pkg/sentry/state/BUILD
+++ b/pkg/sentry/state/BUILD
@@ -12,10 +12,12 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/log",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/time",
+        "//pkg/sentry/vfs",
         "//pkg/sentry/watchdog",
         "//pkg/state/statefile",
         "//pkg/syserror",
diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go
index a06c9b8ab..167754537 100644
--- a/pkg/sentry/state/state.go
+++ b/pkg/sentry/state/state.go
@@ -19,10 +19,12 @@ import (
 	"fmt"
 	"io"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/state/statefile"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -57,12 +59,14 @@ type SaveOpts struct {
 }
 
 // Save saves the system state.
-func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error {
+func (opts SaveOpts) Save(ctx context.Context, k *kernel.Kernel, w *watchdog.Watchdog) error {
 	log.Infof("Sandbox save started, pausing all tasks.")
 	k.Pause()
 	k.ReceiveTaskStates()
-	defer k.Unpause()
-	defer log.Infof("Tasks resumed after save.")
+	defer func() {
+		k.Unpause()
+		log.Infof("Tasks resumed after save.")
+	}()
 
 	w.Stop()
 	defer w.Start()
@@ -79,7 +83,7 @@ func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error {
 		err = ErrStateFile{err}
 	} else {
 		// Save the kernel.
-		err = k.SaveTo(wc)
+		err = k.SaveTo(ctx, wc)
 
 		// ENOSPC is a state file error. This error can only come from
 		// writing the state file, and not from fs.FileOperations.Fsync
@@ -106,7 +110,7 @@ type LoadOpts struct {
 }
 
 // Load loads the given kernel, setting the provided platform and stack.
-func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack, clocks time.Clocks) error {
+func (opts LoadOpts) Load(ctx context.Context, k *kernel.Kernel, n inet.Stack, clocks time.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
 	// Open the file.
 	r, m, err := statefile.NewReader(opts.Source, opts.Key)
 	if err != nil {
@@ -116,5 +120,5 @@ func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack, clocks time.Clocks) er
 	previousMetadata = m
 
 	// Restore the Kernel object graph.
-	return k.LoadFrom(r, n, clocks)
+	return k.LoadFrom(ctx, r, n, clocks, vfsOpts)
 }
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 88d5db9fc..a920180d3 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -28,6 +28,7 @@ go_library(
         "//pkg/binary",
         "//pkg/bits",
         "//pkg/eventchannel",
+        "//pkg/marshal/primitive",
         "//pkg/seccomp",
         "//pkg/sentry/arch",
         "//pkg/sentry/kernel",
diff --git a/pkg/sentry/strace/epoll.go b/pkg/sentry/strace/epoll.go
index 5d51a7792..ae3b998c8 100644
--- a/pkg/sentry/strace/epoll.go
+++ b/pkg/sentry/strace/epoll.go
@@ -26,7 +26,7 @@ import (
 
 func epollEvent(t *kernel.Task, eventAddr usermem.Addr) string {
 	var e linux.EpollEvent
-	if _, err := t.CopyIn(eventAddr, &e); err != nil {
+	if _, err := e.CopyIn(t, eventAddr); err != nil {
 		return fmt.Sprintf("%#x {error reading event: %v}", eventAddr, err)
 	}
 	var sb strings.Builder
@@ -41,7 +41,7 @@ func epollEvents(t *kernel.Task, eventsAddr usermem.Addr, numEvents, maxBytes ui
 	addr := eventsAddr
 	for i := uint64(0); i < numEvents; i++ {
 		var e linux.EpollEvent
-		if _, err := t.CopyIn(addr, &e); err != nil {
+		if _, err := e.CopyIn(t, addr); err != nil {
 			fmt.Fprintf(&sb, "{error reading event at %#x: %v}", addr, err)
 			continue
 		}
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index b51c4c941..cc5f70cd4 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
@@ -166,7 +167,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
 	}
 
 	buf := make([]byte, length)
-	if _, err := t.CopyIn(addr, &buf); err != nil {
+	if _, err := t.CopyInBytes(addr, buf); err != nil {
 		return fmt.Sprintf("%#x (error decoding control: %v)", addr, err)
 	}
 
@@ -302,7 +303,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
 
 func msghdr(t *kernel.Task, addr usermem.Addr, printContent bool, maxBytes uint64) string {
 	var msg slinux.MessageHeader64
-	if err := slinux.CopyInMessageHeader64(t, addr, &msg); err != nil {
+	if _, err := msg.CopyIn(t, addr); err != nil {
 		return fmt.Sprintf("%#x (error decoding msghdr: %v)", addr, err)
 	}
 	s := fmt.Sprintf(
@@ -380,9 +381,9 @@ func postSockAddr(t *kernel.Task, addr usermem.Addr, lengthPtr usermem.Addr) str
 
 func copySockLen(t *kernel.Task, addr usermem.Addr) (uint32, error) {
 	// socklen_t is 32-bits.
-	var l uint32
-	_, err := t.CopyIn(addr, &l)
-	return l, err
+	var l primitive.Uint32
+	_, err := l.CopyIn(t, addr)
+	return uint32(l), err
 }
 
 func sockLenPointer(t *kernel.Task, addr usermem.Addr) string {
@@ -436,22 +437,22 @@ func getSockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, o
 func sockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, optLen uint64, maximumBlobSize uint) string {
 	switch optLen {
 	case 1:
-		var v uint8
-		_, err := t.CopyIn(optVal, &v)
+		var v primitive.Uint8
+		_, err := v.CopyIn(t, optVal)
 		if err != nil {
 			return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
 		}
 		return fmt.Sprintf("%#x {value=%v}", optVal, v)
 	case 2:
-		var v uint16
-		_, err := t.CopyIn(optVal, &v)
+		var v primitive.Uint16
+		_, err := v.CopyIn(t, optVal)
 		if err != nil {
 			return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
 		}
 		return fmt.Sprintf("%#x {value=%v}", optVal, v)
 	case 4:
-		var v uint32
-		_, err := t.CopyIn(optVal, &v)
+		var v primitive.Uint32
+		_, err := v.CopyIn(t, optVal)
 		if err != nil {
 			return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err)
 		}
@@ -632,6 +633,8 @@ var sockOptNames = map[uint64]abi.ValueSet{
 		linux.IPV6_UNICAST_IF:          "IPV6_UNICAST_IF",
 		linux.MCAST_MSFILTER:           "MCAST_MSFILTER",
 		linux.IPV6_ADDRFORM:            "IPV6_ADDRFORM",
+		linux.IP6T_SO_GET_INFO:         "IP6T_SO_GET_INFO",
+		linux.IP6T_SO_GET_ENTRIES:      "IP6T_SO_GET_ENTRIES",
 	},
 	linux.SOL_NETLINK: {
 		linux.NETLINK_BROADCAST_ERROR:  "NETLINK_BROADCAST_ERROR",
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 87b239730..396744597 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -17,17 +17,16 @@
 package strace
 
 import (
-	"encoding/binary"
 	"fmt"
 	"strconv"
 	"strings"
-	"syscall"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/eventchannel"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/seccomp"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -91,7 +90,7 @@ func iovecs(t *kernel.Task, addr usermem.Addr, iovcnt int, printContent bool, ma
 		}
 
 		b := make([]byte, size)
-		amt, err := t.CopyIn(ar.Start, b)
+		amt, err := t.CopyInBytes(ar.Start, b)
 		if err != nil {
 			iovs[i] = fmt.Sprintf("{base=%#x, len=%d, %q..., error decoding string: %v}", ar.Start, ar.Length(), b[:amt], err)
 			continue
@@ -118,7 +117,7 @@ func dump(t *kernel.Task, addr usermem.Addr, size uint, maximumBlobSize uint) st
 	}
 
 	b := make([]byte, size)
-	amt, err := t.CopyIn(addr, b)
+	amt, err := t.CopyInBytes(addr, b)
 	if err != nil {
 		return fmt.Sprintf("%#x (error decoding string: %s)", addr, err)
 	}
@@ -199,7 +198,7 @@ func fdVFS2(t *kernel.Task, fd int32) string {
 
 func fdpair(t *kernel.Task, addr usermem.Addr) string {
 	var fds [2]int32
-	_, err := t.CopyIn(addr, &fds)
+	_, err := primitive.CopyInt32SliceIn(t, addr, fds[:])
 	if err != nil {
 		return fmt.Sprintf("%#x (error decoding fds: %s)", addr, err)
 	}
@@ -209,7 +208,7 @@ func fdpair(t *kernel.Task, addr usermem.Addr) string {
 
 func uname(t *kernel.Task, addr usermem.Addr) string {
 	var u linux.UtsName
-	if _, err := t.CopyIn(addr, &u); err != nil {
+	if _, err := u.CopyIn(t, addr); err != nil {
 		return fmt.Sprintf("%#x (error decoding utsname: %s)", addr, err)
 	}
 
@@ -222,7 +221,7 @@ func utimensTimespec(t *kernel.Task, addr usermem.Addr) string {
 	}
 
 	var tim linux.Timespec
-	if _, err := t.CopyIn(addr, &tim); err != nil {
+	if _, err := tim.CopyIn(t, addr); err != nil {
 		return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err)
 	}
 
@@ -244,7 +243,7 @@ func timespec(t *kernel.Task, addr usermem.Addr) string {
 	}
 
 	var tim linux.Timespec
-	if _, err := t.CopyIn(addr, &tim); err != nil {
+	if _, err := tim.CopyIn(t, addr); err != nil {
 		return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err)
 	}
 	return fmt.Sprintf("%#x {sec=%v nsec=%v}", addr, tim.Sec, tim.Nsec)
@@ -256,7 +255,7 @@ func timeval(t *kernel.Task, addr usermem.Addr) string {
 	}
 
 	var tim linux.Timeval
-	if _, err := t.CopyIn(addr, &tim); err != nil {
+	if _, err := tim.CopyIn(t, addr); err != nil {
 		return fmt.Sprintf("%#x (error decoding timeval: %s)", addr, err)
 	}
 
@@ -268,8 +267,8 @@ func utimbuf(t *kernel.Task, addr usermem.Addr) string {
 		return "null"
 	}
 
-	var utim syscall.Utimbuf
-	if _, err := t.CopyIn(addr, &utim); err != nil {
+	var utim linux.Utime
+	if _, err := utim.CopyIn(t, addr); err != nil {
 		return fmt.Sprintf("%#x (error decoding utimbuf: %s)", addr, err)
 	}
 
@@ -282,7 +281,7 @@ func stat(t *kernel.Task, addr usermem.Addr) string {
 	}
 
 	var stat linux.Stat
-	if _, err := t.CopyIn(addr, &stat); err != nil {
+	if _, err := stat.CopyIn(t, addr); err != nil {
 		return fmt.Sprintf("%#x (error decoding stat: %s)", addr, err)
 	}
 	return fmt.Sprintf("%#x {dev=%d, ino=%d, mode=%s, nlink=%d, uid=%d, gid=%d, rdev=%d, size=%d, blksize=%d, blocks=%d, atime=%s, mtime=%s, ctime=%s}", addr, stat.Dev, stat.Ino, linux.FileMode(stat.Mode), stat.Nlink, stat.UID, stat.GID, stat.Rdev, stat.Size, stat.Blksize, stat.Blocks, time.Unix(stat.ATime.Sec, stat.ATime.Nsec), time.Unix(stat.MTime.Sec, stat.MTime.Nsec), time.Unix(stat.CTime.Sec, stat.CTime.Nsec))
@@ -294,7 +293,7 @@ func itimerval(t *kernel.Task, addr usermem.Addr) string {
 	}
 
 	interval := timeval(t, addr)
-	value := timeval(t, addr+usermem.Addr(binary.Size(linux.Timeval{})))
+	value := timeval(t, addr+usermem.Addr((*linux.Timeval)(nil).SizeBytes()))
 	return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value)
 }
 
@@ -304,7 +303,7 @@ func itimerspec(t *kernel.Task, addr usermem.Addr) string {
 	}
 
 	interval := timespec(t, addr)
-	value := timespec(t, addr+usermem.Addr(binary.Size(linux.Timespec{})))
+	value := timespec(t, addr+usermem.Addr((*linux.Timespec)(nil).SizeBytes()))
 	return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value)
 }
 
@@ -330,7 +329,7 @@ func rusage(t *kernel.Task, addr usermem.Addr) string {
 	}
 
 	var ru linux.Rusage
-	if _, err := t.CopyIn(addr, &ru); err != nil {
+	if _, err := ru.CopyIn(t, addr); err != nil {
 		return fmt.Sprintf("%#x (error decoding rusage: %s)", addr, err)
 	}
 	return fmt.Sprintf("%#x %+v", addr, ru)
@@ -342,7 +341,7 @@ func capHeader(t *kernel.Task, addr usermem.Addr) string {
 	}
 
 	var hdr linux.CapUserHeader
-	if _, err := t.CopyIn(addr, &hdr); err != nil {
+	if _, err := hdr.CopyIn(t, addr); err != nil {
 		return fmt.Sprintf("%#x (error decoding header: %s)", addr, err)
 	}
 
@@ -367,7 +366,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string {
 	}
 
 	var hdr linux.CapUserHeader
-	if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+	if _, err := hdr.CopyIn(t, hdrAddr); err != nil {
 		return fmt.Sprintf("%#x (error decoding header: %v)", dataAddr, err)
 	}
 
@@ -376,7 +375,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string {
 	switch hdr.Version {
 	case linux.LINUX_CAPABILITY_VERSION_1:
 		var data linux.CapUserData
-		if _, err := t.CopyIn(dataAddr, &data); err != nil {
+		if _, err := data.CopyIn(t, dataAddr); err != nil {
 			return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err)
 		}
 		p = uint64(data.Permitted)
@@ -384,7 +383,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string {
 		e = uint64(data.Effective)
 	case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3:
 		var data [2]linux.CapUserData
-		if _, err := t.CopyIn(dataAddr, &data); err != nil {
+		if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil {
 			return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err)
 		}
 		p = uint64(data[0].Permitted) | (uint64(data[1].Permitted) << 32)
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 4a9b04fd0..a2e441448 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -21,6 +21,7 @@ go_library(
         "sys_identity.go",
         "sys_inotify.go",
         "sys_lseek.go",
+        "sys_membarrier.go",
         "sys_mempolicy.go",
         "sys_mmap.go",
         "sys_mount.go",
@@ -56,6 +57,7 @@ go_library(
         "sys_xattr.go",
         "timespec.go",
     ],
+    marshal = True,
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi",
@@ -64,6 +66,8 @@ go_library(
         "//pkg/bpf",
         "//pkg/context",
         "//pkg/log",
+        "//pkg/marshal",
+        "//pkg/marshal/primitive",
         "//pkg/metric",
         "//pkg/rand",
         "//pkg/safemem",
@@ -99,7 +103,5 @@ go_library(
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
-        "//tools/go_marshal/marshal",
-        "//tools/go_marshal/primitive",
     ],
 )
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 80c65164a..36902d177 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -118,7 +118,7 @@ var AMD64 = &kernel.SyscallTable{
 		63:  syscalls.Supported("uname", Uname),
 		64:  syscalls.Supported("semget", Semget),
 		65:  syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
-		66:  syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+		66:  syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
 		67:  syscalls.Supported("shmdt", Shmdt),
 		68:  syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
 		69:  syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
@@ -138,7 +138,7 @@ var AMD64 = &kernel.SyscallTable{
 		83:  syscalls.Supported("mkdir", Mkdir),
 		84:  syscalls.Supported("rmdir", Rmdir),
 		85:  syscalls.Supported("creat", Creat),
-		86:  syscalls.Supported("link", Link),
+		86:  syscalls.PartiallySupported("link", Link, "Limited support with Gofer. Link count and linked files may get out of sync because gVisor is not aware of external hardlinks.", nil),
 		87:  syscalls.Supported("unlink", Unlink),
 		88:  syscalls.Supported("symlink", Symlink),
 		89:  syscalls.Supported("readlink", Readlink),
@@ -305,9 +305,9 @@ var AMD64 = &kernel.SyscallTable{
 		250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
 		251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
 		252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
-		253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil),
-		254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
-		255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
+		253: syscalls.PartiallySupported("inotify_init", InotifyInit, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
+		254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
+		255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
 		256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
 		257: syscalls.Supported("openat", Openat),
 		258: syscalls.Supported("mkdirat", Mkdirat),
@@ -317,7 +317,7 @@ var AMD64 = &kernel.SyscallTable{
 		262: syscalls.Supported("fstatat", Fstatat),
 		263: syscalls.Supported("unlinkat", Unlinkat),
 		264: syscalls.Supported("renameat", Renameat),
-		265: syscalls.Supported("linkat", Linkat),
+		265: syscalls.PartiallySupported("linkat", Linkat, "See link(2).", nil),
 		266: syscalls.Supported("symlinkat", Symlinkat),
 		267: syscalls.Supported("readlinkat", Readlinkat),
 		268: syscalls.Supported("fchmodat", Fchmodat),
@@ -346,7 +346,7 @@ var AMD64 = &kernel.SyscallTable{
 		291: syscalls.Supported("epoll_create1", EpollCreate1),
 		292: syscalls.Supported("dup3", Dup3),
 		293: syscalls.Supported("pipe2", Pipe2),
-		294: syscalls.Supported("inotify_init1", InotifyInit1),
+		294: syscalls.PartiallySupported("inotify_init1", InotifyInit1, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
 		295: syscalls.Supported("preadv", Preadv),
 		296: syscalls.Supported("pwritev", Pwritev),
 		297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo),
@@ -376,7 +376,7 @@ var AMD64 = &kernel.SyscallTable{
 		321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
 		322: syscalls.Supported("execveat", Execveat),
 		323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
-		324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
+		324: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil),
 		325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
 
 		// Syscalls implemented after 325 are "backports" from versions
@@ -454,9 +454,9 @@ var ARM64 = &kernel.SyscallTable{
 		23:  syscalls.Supported("dup", Dup),
 		24:  syscalls.Supported("dup3", Dup3),
 		25:  syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
-		26:  syscalls.Supported("inotify_init1", InotifyInit1),
-		27:  syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
-		28:  syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
+		26:  syscalls.PartiallySupported("inotify_init1", InotifyInit1, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
+		27:  syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
+		28:  syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil),
 		29:  syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil),
 		30:  syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
 		31:  syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
@@ -527,8 +527,8 @@ var ARM64 = &kernel.SyscallTable{
 		96:  syscalls.Supported("set_tid_address", SetTidAddress),
 		97:  syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
 		98:  syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
-		99:  syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
-		100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+		99:  syscalls.Supported("set_robust_list", SetRobustList),
+		100: syscalls.Supported("get_robust_list", GetRobustList),
 		101: syscalls.Supported("nanosleep", Nanosleep),
 		102: syscalls.Supported("getitimer", Getitimer),
 		103: syscalls.Supported("setitimer", Setitimer),
@@ -619,7 +619,7 @@ var ARM64 = &kernel.SyscallTable{
 		188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
 		189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
 		190: syscalls.Supported("semget", Semget),
-		191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+		191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
 		192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
 		193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
 		194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
@@ -695,7 +695,7 @@ var ARM64 = &kernel.SyscallTable{
 		280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
 		281: syscalls.Supported("execveat", Execveat),
 		282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
-		283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
+		283: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil),
 		284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
 
 		// Syscalls after 284 are "backports" from versions of Linux after 4.4.
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index e9d64dec5..0bf313a13 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -17,6 +17,7 @@ package linux
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -36,7 +37,7 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	//
 	// The context pointer _must_ be zero initially.
 	var idIn uint64
-	if _, err := t.CopyIn(idAddr, &idIn); err != nil {
+	if _, err := primitive.CopyUint64In(t, idAddr, &idIn); err != nil {
 		return 0, nil, err
 	}
 	if idIn != 0 {
@@ -49,7 +50,7 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	}
 
 	// Copy out the new ID.
-	if _, err := t.CopyOut(idAddr, &id); err != nil {
+	if _, err := primitive.CopyUint64Out(t, idAddr, id); err != nil {
 		t.MemoryManager().DestroyAIOContext(t, id)
 		return 0, nil, err
 	}
@@ -142,7 +143,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 		ev := v.(*linux.IOEvent)
 
 		// Copy out the result.
-		if _, err := t.CopyOut(eventsAddr, ev); err != nil {
+		if _, err := ev.CopyOut(t, eventsAddr); err != nil {
 			if count > 0 {
 				return uintptr(count), nil, nil
 			}
@@ -338,21 +339,27 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 
 	for i := int32(0); i < nrEvents; i++ {
-		// Copy in the address.
-		cbAddrNative := t.Arch().Native(0)
-		if _, err := t.CopyIn(addr, cbAddrNative); err != nil {
-			if i > 0 {
-				// Some successful.
-				return uintptr(i), nil, nil
+		// Copy in the callback address.
+		var cbAddr usermem.Addr
+		switch t.Arch().Width() {
+		case 8:
+			var cbAddrP primitive.Uint64
+			if _, err := cbAddrP.CopyIn(t, addr); err != nil {
+				if i > 0 {
+					// Some successful.
+					return uintptr(i), nil, nil
+				}
+				// Nothing done.
+				return 0, nil, err
 			}
-			// Nothing done.
-			return 0, nil, err
+			cbAddr = usermem.Addr(cbAddrP)
+		default:
+			return 0, nil, syserror.ENOSYS
 		}
 
 		// Copy in this callback.
 		var cb linux.IOCallback
-		cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative))
-		if _, err := t.CopyIn(cbAddr, &cb); err != nil {
+		if _, err := cb.CopyIn(t, cbAddr); err != nil {
 
 			if i > 0 {
 				// Some have been successful.
diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go
index adf5ea5f2..d3b85e11b 100644
--- a/pkg/sentry/syscalls/linux/sys_capability.go
+++ b/pkg/sentry/syscalls/linux/sys_capability.go
@@ -45,7 +45,7 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	dataAddr := args[1].Pointer()
 
 	var hdr linux.CapUserHeader
-	if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+	if _, err := hdr.CopyIn(t, hdrAddr); err != nil {
 		return 0, nil, err
 	}
 	// hdr.Pid doesn't need to be valid if this capget() is a "version probe"
@@ -65,7 +65,7 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			Permitted:   uint32(p),
 			Inheritable: uint32(i),
 		}
-		_, err = t.CopyOut(dataAddr, &data)
+		_, err = data.CopyOut(t, dataAddr)
 		return 0, nil, err
 
 	case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3:
@@ -88,12 +88,12 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 				Inheritable: uint32(i >> 32),
 			},
 		}
-		_, err = t.CopyOut(dataAddr, &data)
+		_, err = linux.CopyCapUserDataSliceOut(t, dataAddr, data[:])
 		return 0, nil, err
 
 	default:
 		hdr.Version = linux.HighestCapabilityVersion
-		if _, err := t.CopyOut(hdrAddr, &hdr); err != nil {
+		if _, err := hdr.CopyOut(t, hdrAddr); err != nil {
 			return 0, nil, err
 		}
 		if dataAddr != 0 {
@@ -109,7 +109,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	dataAddr := args[1].Pointer()
 
 	var hdr linux.CapUserHeader
-	if _, err := t.CopyIn(hdrAddr, &hdr); err != nil {
+	if _, err := hdr.CopyIn(t, hdrAddr); err != nil {
 		return 0, nil, err
 	}
 	switch hdr.Version {
@@ -118,7 +118,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			return 0, nil, syserror.EPERM
 		}
 		var data linux.CapUserData
-		if _, err := t.CopyIn(dataAddr, &data); err != nil {
+		if _, err := data.CopyIn(t, dataAddr); err != nil {
 			return 0, nil, err
 		}
 		p := auth.CapabilitySet(data.Permitted) & auth.AllCapabilities
@@ -131,7 +131,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			return 0, nil, syserror.EPERM
 		}
 		var data [2]linux.CapUserData
-		if _, err := t.CopyIn(dataAddr, &data); err != nil {
+		if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil {
 			return 0, nil, err
 		}
 		p := (auth.CapabilitySet(data[0].Permitted) | (auth.CapabilitySet(data[1].Permitted) << 32)) & auth.AllCapabilities
@@ -141,7 +141,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 
 	default:
 		hdr.Version = linux.HighestCapabilityVersion
-		if _, err := t.CopyOut(hdrAddr, &hdr); err != nil {
+		if _, err := hdr.CopyOut(t, hdrAddr); err != nil {
 			return 0, nil, err
 		}
 		return 0, nil, syserror.EINVAL
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 256422689..519066a47 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -19,6 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
@@ -83,6 +84,7 @@ func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(ro
 		}
 		rel = f.Dirent
 		if !fs.IsDir(rel.Inode.StableAttr) {
+			f.DecRef(t)
 			return syserror.ENOTDIR
 		}
 	}
@@ -601,19 +603,19 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	// Shared flags between file and socket.
 	switch request {
 	case linux.FIONCLEX:
-		t.FDTable().SetFlags(fd, kernel.FDFlags{
+		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
 			CloseOnExec: false,
 		})
 		return 0, nil, nil
 	case linux.FIOCLEX:
-		t.FDTable().SetFlags(fd, kernel.FDFlags{
+		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
 			CloseOnExec: true,
 		})
 		return 0, nil, nil
 
 	case linux.FIONBIO:
 		var set int32
-		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
 			return 0, nil, err
 		}
 		flags := file.Flags()
@@ -627,7 +629,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 
 	case linux.FIOASYNC:
 		var set int32
-		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
 			return 0, nil, err
 		}
 		flags := file.Flags()
@@ -641,15 +643,14 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 
 	case linux.FIOSETOWN, linux.SIOCSPGRP:
 		var set int32
-		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
 			return 0, nil, err
 		}
 		fSetOwn(t, file, set)
 		return 0, nil, nil
 
 	case linux.FIOGETOWN, linux.SIOCGPGRP:
-		who := fGetOwn(t, file)
-		_, err := t.CopyOut(args[2].Pointer(), &who)
+		_, err := primitive.CopyInt32Out(t, args[2].Pointer(), fGetOwn(t, file))
 		return 0, nil, err
 
 	default:
@@ -694,7 +695,7 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	}
 
 	// Top it off with a terminator.
-	_, err = t.CopyOut(addr+usermem.Addr(bytes), []byte("\x00"))
+	_, err = t.CopyOutBytes(addr+usermem.Addr(bytes), []byte("\x00"))
 	return uintptr(bytes + 1), nil, err
 }
 
@@ -787,7 +788,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	// Note that Remove provides a reference on the file that we may use to
 	// flush. It is still active until we drop the final reference below
 	// (and other reference-holding operations complete).
-	file, _ := t.FDTable().Remove(fd)
+	file, _ := t.FDTable().Remove(t, fd)
 	if file == nil {
 		return 0, nil, syserror.EBADF
 	}
@@ -941,7 +942,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return uintptr(flags.ToLinuxFDFlags()), nil, nil
 	case linux.F_SETFD:
 		flags := args[2].Uint()
-		err := t.FDTable().SetFlags(fd, kernel.FDFlags{
+		err := t.FDTable().SetFlags(t, fd, kernel.FDFlags{
 			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
 		})
 		return 0, nil, err
@@ -962,7 +963,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		// Copy in the lock request.
 		flockAddr := args[2].Pointer()
 		var flock linux.Flock
-		if _, err := t.CopyIn(flockAddr, &flock); err != nil {
+		if _, err := flock.CopyIn(t, flockAddr); err != nil {
 			return 0, nil, err
 		}
 
@@ -1052,12 +1053,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	case linux.F_GETOWN_EX:
 		addr := args[2].Pointer()
 		owner := fGetOwnEx(t, file)
-		_, err := t.CopyOut(addr, &owner)
+		_, err := owner.CopyOut(t, addr)
 		return 0, nil, err
 	case linux.F_SETOWN_EX:
 		addr := args[2].Pointer()
 		var owner linux.FOwnerEx
-		_, err := t.CopyIn(addr, &owner)
+		_, err := owner.CopyIn(t, addr)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -1154,6 +1155,10 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, nil
 }
 
+// LINT.ThenChange(vfs2/fd.go)
+
+// LINT.IfChange
+
 func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
 	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -1918,7 +1923,7 @@ func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	ts := defaultSetToSystemTimeSpec()
 	if timesAddr != 0 {
 		var times linux.Utime
-		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+		if _, err := times.CopyIn(t, timesAddr); err != nil {
 			return 0, nil, err
 		}
 		ts = fs.TimeSpec{
@@ -1938,7 +1943,7 @@ func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	ts := defaultSetToSystemTimeSpec()
 	if timesAddr != 0 {
 		var times [2]linux.Timeval
-		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+		if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
 			return 0, nil, err
 		}
 		ts = fs.TimeSpec{
@@ -1966,7 +1971,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	ts := defaultSetToSystemTimeSpec()
 	if timesAddr != 0 {
 		var times [2]linux.Timespec
-		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+		if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil {
 			return 0, nil, err
 		}
 		if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) {
@@ -2000,7 +2005,7 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	ts := defaultSetToSystemTimeSpec()
 	if timesAddr != 0 {
 		var times [2]linux.Timeval
-		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+		if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
 			return 0, nil, err
 		}
 		if times[0].Usec >= 1e6 || times[0].Usec < 0 ||
diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
index 12b2fa690..f39ce0639 100644
--- a/pkg/sentry/syscalls/linux/sys_futex.go
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -306,8 +306,8 @@ func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
 	// Despite the syscall using the name 'pid' for this variable, it is
 	// very much a tid.
 	tid := args[0].Int()
-	head := args[1].Pointer()
-	size := args[2].Pointer()
+	headAddr := args[1].Pointer()
+	sizeAddr := args[2].Pointer()
 
 	if tid < 0 {
 		return 0, nil, syserror.EINVAL
@@ -321,12 +321,16 @@ func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
 	}
 
 	// Copy out head pointer.
-	if _, err := t.CopyOut(head, uint64(ot.GetRobustList())); err != nil {
+	head := t.Arch().Native(uintptr(ot.GetRobustList()))
+	if _, err := head.CopyOut(t, headAddr); err != nil {
 		return 0, nil, err
 	}
 
-	// Copy out size, which is a constant.
-	if _, err := t.CopyOut(size, uint64(linux.SizeOfRobustListHead)); err != nil {
+	// Copy out size, which is a constant. Note that while size isn't
+	// an address, it is defined as the arch-dependent size_t, so it
+	// needs to be converted to a native-sized int.
+	size := t.Arch().Native(uintptr(linux.SizeOfRobustListHead))
+	if _, err := size.CopyOut(t, sizeAddr); err != nil {
 		return 0, nil, err
 	}
 
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index 59004cefe..b25f7d881 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -19,7 +19,6 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -93,19 +92,23 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir
 	}
 }
 
-// oldDirentHdr is a fixed sized header matching the fixed size
-// fields found in the old linux dirent struct.
+// oldDirentHdr is a fixed sized header matching the fixed size fields found in
+// the old linux dirent struct.
+//
+// +marshal
 type oldDirentHdr struct {
 	Ino    uint64
 	Off    uint64
-	Reclen uint16
+	Reclen uint16 `marshal:"unaligned"` // Struct ends mid-word.
 }
 
-// direntHdr is a fixed sized header matching the fixed size
-// fields found in the new linux dirent struct.
+// direntHdr is a fixed sized header matching the fixed size fields found in the
+// new linux dirent struct.
+//
+// +marshal
 type direntHdr struct {
 	OldHdr oldDirentHdr
-	Typ    uint8
+	Typ    uint8 `marshal:"unaligned"` // Struct ends mid-word.
 }
 
 // dirent contains the data pointed to by a new linux dirent struct.
@@ -134,20 +137,20 @@ func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent
 // the old linux dirent format.
 func smallestDirent(a arch.Context) uint {
 	d := dirent{}
-	return uint(binary.Size(d.Hdr.OldHdr)) + a.Width() + 1
+	return uint(d.Hdr.OldHdr.SizeBytes()) + a.Width() + 1
 }
 
 // smallestDirent64 returns the size of the smallest possible dirent using
 // the new linux dirent format.
 func smallestDirent64(a arch.Context) uint {
 	d := dirent{}
-	return uint(binary.Size(d.Hdr)) + a.Width()
+	return uint(d.Hdr.SizeBytes()) + a.Width()
 }
 
 // padRec pads the name field until the rec length is a multiple of the width,
 // which must be a power of 2. It returns the padded rec length.
 func (d *dirent) padRec(width int) uint16 {
-	a := int(binary.Size(d.Hdr)) + len(d.Name)
+	a := d.Hdr.SizeBytes() + len(d.Name)
 	r := (a + width) &^ (width - 1)
 	padding := r - a
 	d.Name = append(d.Name, make([]byte, padding)...)
@@ -157,7 +160,7 @@ func (d *dirent) padRec(width int) uint16 {
 // Serialize64 serializes a Dirent struct to a byte slice, keeping the new
 // linux dirent format. Returns the number of bytes serialized or an error.
 func (d *dirent) Serialize64(w io.Writer) (int, error) {
-	n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr))
+	n1, err := d.Hdr.WriteTo(w)
 	if err != nil {
 		return 0, err
 	}
@@ -165,14 +168,14 @@ func (d *dirent) Serialize64(w io.Writer) (int, error) {
 	if err != nil {
 		return 0, err
 	}
-	return n1 + n2, nil
+	return int(n1) + n2, nil
 }
 
 // Serialize serializes a Dirent struct to a byte slice, using the old linux
 // dirent format.
 // Returns the number of bytes serialized or an error.
 func (d *dirent) Serialize(w io.Writer) (int, error) {
-	n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr.OldHdr))
+	n1, err := d.Hdr.OldHdr.WriteTo(w)
 	if err != nil {
 		return 0, err
 	}
@@ -184,7 +187,7 @@ func (d *dirent) Serialize(w io.Writer) (int, error) {
 	if err != nil {
 		return 0, err
 	}
-	return n1 + n2 + n3, nil
+	return int(n1) + n2 + n3, nil
 }
 
 // direntSerializer implements fs.InodeOperationsInfoSerializer, serializing dirents to an
diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go
index 715ac45e6..a29d307e5 100644
--- a/pkg/sentry/syscalls/linux/sys_identity.go
+++ b/pkg/sentry/syscalls/linux/sys_identity.go
@@ -49,13 +49,13 @@ func Getresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	ruid := c.RealKUID.In(c.UserNamespace).OrOverflow()
 	euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow()
 	suid := c.SavedKUID.In(c.UserNamespace).OrOverflow()
-	if _, err := t.CopyOut(ruidAddr, ruid); err != nil {
+	if _, err := ruid.CopyOut(t, ruidAddr); err != nil {
 		return 0, nil, err
 	}
-	if _, err := t.CopyOut(euidAddr, euid); err != nil {
+	if _, err := euid.CopyOut(t, euidAddr); err != nil {
 		return 0, nil, err
 	}
-	if _, err := t.CopyOut(suidAddr, suid); err != nil {
+	if _, err := suid.CopyOut(t, suidAddr); err != nil {
 		return 0, nil, err
 	}
 	return 0, nil, nil
@@ -84,13 +84,13 @@ func Getresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	rgid := c.RealKGID.In(c.UserNamespace).OrOverflow()
 	egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow()
 	sgid := c.SavedKGID.In(c.UserNamespace).OrOverflow()
-	if _, err := t.CopyOut(rgidAddr, rgid); err != nil {
+	if _, err := rgid.CopyOut(t, rgidAddr); err != nil {
 		return 0, nil, err
 	}
-	if _, err := t.CopyOut(egidAddr, egid); err != nil {
+	if _, err := egid.CopyOut(t, egidAddr); err != nil {
 		return 0, nil, err
 	}
-	if _, err := t.CopyOut(sgidAddr, sgid); err != nil {
+	if _, err := sgid.CopyOut(t, sgidAddr); err != nil {
 		return 0, nil, err
 	}
 	return 0, nil, nil
@@ -157,7 +157,7 @@ func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	for i, kgid := range kgids {
 		gids[i] = kgid.In(t.UserNamespace()).OrOverflow()
 	}
-	if _, err := t.CopyOut(args[1].Pointer(), gids); err != nil {
+	if _, err := auth.CopyGIDSliceOut(t, args[1].Pointer(), gids); err != nil {
 		return 0, nil, err
 	}
 	return uintptr(len(gids)), nil, nil
@@ -173,7 +173,7 @@ func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, t.SetExtraGIDs(nil)
 	}
 	gids := make([]auth.GID, size)
-	if _, err := t.CopyIn(args[1].Pointer(), &gids); err != nil {
+	if _, err := auth.CopyGIDSliceIn(t, args[1].Pointer(), gids); err != nil {
 		return 0, nil, err
 	}
 	return 0, nil, t.SetExtraGIDs(gids)
diff --git a/pkg/sentry/syscalls/linux/sys_membarrier.go b/pkg/sentry/syscalls/linux/sys_membarrier.go
new file mode 100644
index 000000000..63ee5d435
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_membarrier.go
@@ -0,0 +1,103 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Membarrier implements syscall membarrier(2).
+func Membarrier(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	cmd := args[0].Int()
+	flags := args[1].Uint()
+
+	switch cmd {
+	case linux.MEMBARRIER_CMD_QUERY:
+		if flags != 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		var supportedCommands uintptr
+		if t.Kernel().Platform.HaveGlobalMemoryBarrier() {
+			supportedCommands |= linux.MEMBARRIER_CMD_GLOBAL |
+				linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED |
+				linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |
+				linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED |
+				linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED
+		}
+		if t.RSeqAvailable() {
+			supportedCommands |= linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ |
+				linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ
+		}
+		return supportedCommands, nil, nil
+	case linux.MEMBARRIER_CMD_GLOBAL, linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+		if flags != 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		if !t.Kernel().Platform.HaveGlobalMemoryBarrier() {
+			return 0, nil, syserror.EINVAL
+		}
+		if cmd == linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED && !t.MemoryManager().IsMembarrierPrivateEnabled() {
+			return 0, nil, syserror.EPERM
+		}
+		return 0, nil, t.Kernel().Platform.GlobalMemoryBarrier()
+	case linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+		if flags != 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		if !t.Kernel().Platform.HaveGlobalMemoryBarrier() {
+			return 0, nil, syserror.EINVAL
+		}
+		// no-op
+		return 0, nil, nil
+	case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+		if flags != 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		if !t.Kernel().Platform.HaveGlobalMemoryBarrier() {
+			return 0, nil, syserror.EINVAL
+		}
+		t.MemoryManager().EnableMembarrierPrivate()
+		return 0, nil, nil
+	case linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+		if flags&^linux.MEMBARRIER_CMD_FLAG_CPU != 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		if !t.RSeqAvailable() {
+			return 0, nil, syserror.EINVAL
+		}
+		if !t.MemoryManager().IsMembarrierRSeqEnabled() {
+			return 0, nil, syserror.EPERM
+		}
+		// MEMBARRIER_CMD_FLAG_CPU and cpu_id are ignored since we don't have
+		// the ability to preempt specific CPUs.
+		return 0, nil, t.Kernel().Platform.PreemptAllCPUs()
+	case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+		if flags != 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		if !t.RSeqAvailable() {
+			return 0, nil, syserror.EINVAL
+		}
+		t.MemoryManager().EnableMembarrierRSeq()
+		return 0, nil, nil
+	default:
+		// Probably a command we don't implement.
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, nil, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index d0109baa4..cd8dfdfa4 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -100,6 +100,15 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 		if err := file.ConfigureMMap(t, &opts); err != nil {
 			return 0, nil, err
 		}
+	} else if shared {
+		// Back shared anonymous mappings with a special mappable.
+		opts.Offset = 0
+		m, err := mm.NewSharedAnonMappable(opts.Length, t.Kernel())
+		if err != nil {
+			return 0, nil, err
+		}
+		opts.MappingIdentity = m // transfers ownership of m to opts
+		opts.Mappable = m
 	}
 
 	rv, err := t.MemoryManager().MMap(t, opts)
@@ -239,7 +248,7 @@ func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 		return 0, nil, syserror.ENOMEM
 	}
 	resident := bytes.Repeat([]byte{1}, int(mapped/usermem.PageSize))
-	_, err := t.CopyOut(vec, resident)
+	_, err := t.CopyOutBytes(vec, resident)
 	return 0, nil, err
 }
 
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
index 3149e4aad..849a47476 100644
--- a/pkg/sentry/syscalls/linux/sys_pipe.go
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -16,6 +16,7 @@ package linux
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -46,9 +47,9 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
 		return 0, err
 	}
 
-	if _, err := t.CopyOut(addr, fds); err != nil {
+	if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil {
 		for _, fd := range fds {
-			if file, _ := t.FDTable().Remove(fd); file != nil {
+			if file, _ := t.FDTable().Remove(t, fd); file != nil {
 				file.DecRef(t)
 			}
 		}
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index 789e2ed5b..254f4c9f9 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -162,7 +162,7 @@ func CopyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD
 
 	pfd := make([]linux.PollFD, nfds)
 	if nfds > 0 {
-		if _, err := t.CopyIn(addr, &pfd); err != nil {
+		if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil {
 			return nil, err
 		}
 	}
@@ -189,7 +189,7 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration)
 	// The poll entries are copied out regardless of whether
 	// any are set or not. This aligns with the Linux behavior.
 	if nfds > 0 && err == nil {
-		if _, err := t.CopyOut(addr, pfd); err != nil {
+		if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil {
 			return remainingTimeout, 0, err
 		}
 	}
@@ -202,7 +202,7 @@ func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialBy
 	set := make([]byte, nBytes)
 
 	if addr != 0 {
-		if _, err := t.CopyIn(addr, &set); err != nil {
+		if _, err := t.CopyInBytes(addr, set); err != nil {
 			return nil, err
 		}
 		// If we only use part of the last byte, mask out the extraneous bits.
@@ -329,19 +329,19 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
 
 	// Copy updated vectors back.
 	if readFDs != 0 {
-		if _, err := t.CopyOut(readFDs, r); err != nil {
+		if _, err := t.CopyOutBytes(readFDs, r); err != nil {
 			return 0, err
 		}
 	}
 
 	if writeFDs != 0 {
-		if _, err := t.CopyOut(writeFDs, w); err != nil {
+		if _, err := t.CopyOutBytes(writeFDs, w); err != nil {
 			return 0, err
 		}
 	}
 
 	if exceptFDs != 0 {
-		if _, err := t.CopyOut(exceptFDs, e); err != nil {
+		if _, err := t.CopyOutBytes(exceptFDs, e); err != nil {
 			return 0, err
 		}
 	}
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index 64a725296..a892d2c62 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -18,6 +18,7 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
@@ -43,7 +44,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return 0, nil, nil
 
 	case linux.PR_GET_PDEATHSIG:
-		_, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal()))
+		_, err := primitive.CopyInt32Out(t, args[1].Pointer(), int32(t.ParentDeathSignal()))
 		return 0, nil, err
 
 	case linux.PR_GET_DUMPABLE:
@@ -110,7 +111,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 			buf[len] = 0
 			len++
 		}
-		_, err := t.CopyOut(addr, buf[:len])
+		_, err := t.CopyOutBytes(addr, buf[:len])
 		if err != nil {
 			return 0, nil, err
 		}
diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go
index d5d5b6959..309c183a3 100644
--- a/pkg/sentry/syscalls/linux/sys_rlimit.go
+++ b/pkg/sentry/syscalls/linux/sys_rlimit.go
@@ -16,6 +16,7 @@ package linux
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
@@ -26,17 +27,13 @@ import (
 // rlimit describes an implementation of 'struct rlimit', which may vary from
 // system-to-system.
 type rlimit interface {
+	marshal.Marshallable
+
 	// toLimit converts an rlimit to a limits.Limit.
 	toLimit() *limits.Limit
 
 	// fromLimit converts a limits.Limit to an rlimit.
 	fromLimit(lim limits.Limit)
-
-	// copyIn copies an rlimit from the untrusted app to the kernel.
-	copyIn(t *kernel.Task, addr usermem.Addr) error
-
-	// copyOut copies an rlimit from the kernel to the untrusted app.
-	copyOut(t *kernel.Task, addr usermem.Addr) error
 }
 
 // newRlimit returns the appropriate rlimit type for 'struct rlimit' on this system.
@@ -50,6 +47,7 @@ func newRlimit(t *kernel.Task) (rlimit, error) {
 	}
 }
 
+// +marshal
 type rlimit64 struct {
 	Cur uint64
 	Max uint64
@@ -70,12 +68,12 @@ func (r *rlimit64) fromLimit(lim limits.Limit) {
 }
 
 func (r *rlimit64) copyIn(t *kernel.Task, addr usermem.Addr) error {
-	_, err := t.CopyIn(addr, r)
+	_, err := r.CopyIn(t, addr)
 	return err
 }
 
 func (r *rlimit64) copyOut(t *kernel.Task, addr usermem.Addr) error {
-	_, err := t.CopyOut(addr, *r)
+	_, err := r.CopyOut(t, addr)
 	return err
 }
 
@@ -140,7 +138,8 @@ func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, err
 	}
 	rlim.fromLimit(lim)
-	return 0, nil, rlim.copyOut(t, addr)
+	_, err = rlim.CopyOut(t, addr)
+	return 0, nil, err
 }
 
 // Setrlimit implements linux syscall setrlimit(2).
@@ -155,7 +154,7 @@ func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	if err != nil {
 		return 0, nil, err
 	}
-	if err := rlim.copyIn(t, addr); err != nil {
+	if _, err := rlim.CopyIn(t, addr); err != nil {
 		return 0, nil, syserror.EFAULT
 	}
 	_, err = prlimit64(t, resource, rlim.toLimit())
diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go
index 1674c7445..ac5c98a54 100644
--- a/pkg/sentry/syscalls/linux/sys_rusage.go
+++ b/pkg/sentry/syscalls/linux/sys_rusage.go
@@ -80,7 +80,7 @@ func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	}
 
 	ru := getrusage(t, which)
-	_, err := t.CopyOut(addr, &ru)
+	_, err := ru.CopyOut(t, addr)
 	return 0, nil, err
 }
 
@@ -104,7 +104,7 @@ func Times(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		CUTime: linux.ClockTFromDuration(cs2.UserTime),
 		CSTime: linux.ClockTFromDuration(cs2.SysTime),
 	}
-	if _, err := t.CopyOut(addr, &r); err != nil {
+	if _, err := r.CopyOut(t, addr); err != nil {
 		return 0, nil, err
 	}
 
diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go
index 99f6993f5..bfcf44b6f 100644
--- a/pkg/sentry/syscalls/linux/sys_sched.go
+++ b/pkg/sentry/syscalls/linux/sys_sched.go
@@ -27,8 +27,10 @@ const (
 )
 
 // SchedParam replicates struct sched_param in sched.h.
+//
+// +marshal
 type SchedParam struct {
-	schedPriority int64
+	schedPriority int32
 }
 
 // SchedGetparam implements linux syscall sched_getparam(2).
@@ -45,7 +47,7 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
 		return 0, nil, syserror.ESRCH
 	}
 	r := SchedParam{schedPriority: onlyPriority}
-	if _, err := t.CopyOut(param, r); err != nil {
+	if _, err := r.CopyOut(t, param); err != nil {
 		return 0, nil, err
 	}
 
@@ -79,7 +81,7 @@ func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ke
 		return 0, nil, syserror.ESRCH
 	}
 	var r SchedParam
-	if _, err := t.CopyIn(param, &r); err != nil {
+	if _, err := r.CopyIn(t, param); err != nil {
 		return 0, nil, syserror.EINVAL
 	}
 	if r.schedPriority != onlyPriority {
diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go
index 5b7a66f4d..4fdb4463c 100644
--- a/pkg/sentry/syscalls/linux/sys_seccomp.go
+++ b/pkg/sentry/syscalls/linux/sys_seccomp.go
@@ -24,6 +24,8 @@ import (
 )
 
 // userSockFprog is equivalent to Linux's struct sock_fprog on amd64.
+//
+// +marshal
 type userSockFprog struct {
 	// Len is the length of the filter in BPF instructions.
 	Len uint16
@@ -33,7 +35,7 @@ type userSockFprog struct {
 	// Filter is a user pointer to the struct sock_filter array that makes up
 	// the filter program. Filter is a uint64 rather than a usermem.Addr
 	// because usermem.Addr is actually uintptr, which is not a fixed-size
-	// type, and encoding/binary.Read objects to this.
+	// type.
 	Filter uint64
 }
 
@@ -54,11 +56,11 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error {
 	}
 
 	var fprog userSockFprog
-	if _, err := t.CopyIn(addr, &fprog); err != nil {
+	if _, err := fprog.CopyIn(t, addr); err != nil {
 		return err
 	}
 	filter := make([]linux.BPFInstruction, int(fprog.Len))
-	if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil {
+	if _, err := linux.CopyBPFInstructionSliceIn(t, usermem.Addr(fprog.Filter), filter); err != nil {
 		return err
 	}
 	compiledFilter, err := bpf.Compile(filter)
diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go
index 5f54f2456..c2d4bf805 100644
--- a/pkg/sentry/syscalls/linux/sys_sem.go
+++ b/pkg/sentry/syscalls/linux/sys_sem.go
@@ -18,6 +18,7 @@ import (
 	"math"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -66,7 +67,7 @@ func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 
 	ops := make([]linux.Sembuf, nsops)
-	if _, err := t.CopyIn(sembufAddr, ops); err != nil {
+	if _, err := linux.CopySembufSliceIn(t, sembufAddr, ops); err != nil {
 		return 0, nil, err
 	}
 
@@ -116,8 +117,8 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 
 	case linux.IPC_SET:
 		arg := args[3].Pointer()
-		s := linux.SemidDS{}
-		if _, err := t.CopyIn(arg, &s); err != nil {
+		var s linux.SemidDS
+		if _, err := s.CopyIn(t, arg); err != nil {
 			return 0, nil, err
 		}
 
@@ -128,9 +129,17 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 		v, err := getPID(t, id, num)
 		return uintptr(v), nil, err
 
+	case linux.IPC_STAT:
+		arg := args[3].Pointer()
+		ds, err := ipcStat(t, id)
+		if err == nil {
+			_, err = ds.CopyOut(t, arg)
+		}
+
+		return 0, nil, err
+
 	case linux.IPC_INFO,
 		linux.SEM_INFO,
-		linux.IPC_STAT,
 		linux.SEM_STAT,
 		linux.SEM_STAT_ANY,
 		linux.GETNCNT,
@@ -170,6 +179,16 @@ func ipcSet(t *kernel.Task, id int32, uid auth.UID, gid auth.GID, perms fs.FileP
 	return set.Change(t, creds, owner, perms)
 }
 
+func ipcStat(t *kernel.Task, id int32) (*linux.SemidDS, error) {
+	r := t.IPCNamespace().SemaphoreRegistry()
+	set := r.FindByID(id)
+	if set == nil {
+		return nil, syserror.EINVAL
+	}
+	creds := auth.CredentialsFromContext(t)
+	return set.GetStat(creds)
+}
+
 func setVal(t *kernel.Task, id int32, num int32, val int16) error {
 	r := t.IPCNamespace().SemaphoreRegistry()
 	set := r.FindByID(id)
@@ -188,7 +207,7 @@ func setValAll(t *kernel.Task, id int32, array usermem.Addr) error {
 		return syserror.EINVAL
 	}
 	vals := make([]uint16, set.Size())
-	if _, err := t.CopyIn(array, vals); err != nil {
+	if _, err := primitive.CopyUint16SliceIn(t, array, vals); err != nil {
 		return err
 	}
 	creds := auth.CredentialsFromContext(t)
@@ -217,7 +236,7 @@ func getValAll(t *kernel.Task, id int32, array usermem.Addr) error {
 	if err != nil {
 		return err
 	}
-	_, err = t.CopyOut(array, vals)
+	_, err = primitive.CopyUint16SliceOut(t, array, vals)
 	return err
 }
 
diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go
index f0ae8fa8e..584064143 100644
--- a/pkg/sentry/syscalls/linux/sys_shm.go
+++ b/pkg/sentry/syscalls/linux/sys_shm.go
@@ -112,18 +112,18 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 
 		stat, err := segment.IPCStat(t)
 		if err == nil {
-			_, err = t.CopyOut(buf, stat)
+			_, err = stat.CopyOut(t, buf)
 		}
 		return 0, nil, err
 
 	case linux.IPC_INFO:
 		params := r.IPCInfo()
-		_, err := t.CopyOut(buf, params)
+		_, err := params.CopyOut(t, buf)
 		return 0, nil, err
 
 	case linux.SHM_INFO:
 		info := r.ShmInfo()
-		_, err := t.CopyOut(buf, info)
+		_, err := info.CopyOut(t, buf)
 		return 0, nil, err
 	}
 
@@ -137,11 +137,10 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	switch cmd {
 	case linux.IPC_SET:
 		var ds linux.ShmidDS
-		_, err = t.CopyIn(buf, &ds)
-		if err != nil {
+		if _, err = ds.CopyIn(t, buf); err != nil {
 			return 0, nil, err
 		}
-		err = segment.Set(t, &ds)
+		err := segment.Set(t, &ds)
 		return 0, nil, err
 
 	case linux.IPC_RMID:
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 38f573c14..9cd052c3d 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -19,6 +19,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -29,8 +31,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
-	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // LINT.IfChange
@@ -67,10 +67,10 @@ const flagsOffset = 48
 const sizeOfInt32 = 4
 
 // messageHeader64Len is the length of a MessageHeader64 struct.
-var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes())
 
 // multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct.
-var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes())
 
 // baseRecvFlags are the flags that are accepted across recvmsg(2),
 // recvmmsg(2), and recvfrom(2).
@@ -78,6 +78,8 @@ const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT |
 
 // MessageHeader64 is the 64-bit representation of the msghdr struct used in
 // the recvmsg and sendmsg syscalls.
+//
+// +marshal
 type MessageHeader64 struct {
 	// Name is the optional pointer to a network address buffer.
 	Name uint64
@@ -106,30 +108,14 @@ type MessageHeader64 struct {
 
 // multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
 // the recvmmsg and sendmmsg syscalls.
+//
+// +marshal
 type multipleMessageHeader64 struct {
 	msgHdr MessageHeader64
 	msgLen uint32
 	_      int32
 }
 
-// CopyInMessageHeader64 copies a message header from user to kernel memory.
-func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
-	b := t.CopyScratchBuffer(52)
-	if _, err := t.CopyInBytes(addr, b); err != nil {
-		return err
-	}
-
-	msg.Name = usermem.ByteOrder.Uint64(b[0:])
-	msg.NameLen = usermem.ByteOrder.Uint32(b[8:])
-	msg.Iov = usermem.ByteOrder.Uint64(b[16:])
-	msg.IovLen = usermem.ByteOrder.Uint64(b[24:])
-	msg.Control = usermem.ByteOrder.Uint64(b[32:])
-	msg.ControlLen = usermem.ByteOrder.Uint64(b[40:])
-	msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:]))
-
-	return nil
-}
-
 // CaptureAddress allocates memory for and copies a socket address structure
 // from the untrusted address space range.
 func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
@@ -148,10 +134,10 @@ func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte,
 // writeAddress writes a sockaddr structure and its length to an output buffer
 // in the unstrusted address space range. If the address is bigger than the
 // buffer, it is truncated.
-func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
 	// Get the buffer length.
 	var bufLen uint32
-	if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil {
+	if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil {
 		return err
 	}
 
@@ -160,7 +146,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user
 	}
 
 	// Write the length unconditionally.
-	if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+	if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil {
 		return err
 	}
 
@@ -173,7 +159,8 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user
 	}
 
 	// Copy as much of the address as will fit in the buffer.
-	encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr)
+	encodedAddr := t.CopyScratchBuffer(addr.SizeBytes())
+	addr.MarshalUnsafe(encodedAddr)
 	if bufLen > uint32(len(encodedAddr)) {
 		bufLen = uint32(len(encodedAddr))
 	}
@@ -247,9 +234,9 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	}
 
 	// Copy the file descriptors out.
-	if _, err := t.CopyOut(socks, fds); err != nil {
+	if _, err := primitive.CopyInt32SliceOut(t, socks, fds); err != nil {
 		for _, fd := range fds {
-			if file, _ := t.FDTable().Remove(fd); file != nil {
+			if file, _ := t.FDTable().Remove(t, fd); file != nil {
 				file.DecRef(t)
 			}
 		}
@@ -456,8 +443,8 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	}
 
 	// Read the length. Reject negative values.
-	optLen := int32(0)
-	if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+	var optLen int32
+	if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil {
 		return 0, nil, err
 	}
 	if optLen < 0 {
@@ -471,7 +458,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	}
 
 	vLen := int32(binary.Size(v))
-	if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+	if _, err := primitive.CopyInt32Out(t, optLenAddr, vLen); err != nil {
 		return 0, nil, err
 	}
 
@@ -733,7 +720,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		if !ok {
 			return 0, nil, syserror.EFAULT
 		}
-		if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+		if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
 			break
 		}
 		count++
@@ -748,7 +735,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
 	// Capture the message header and io vectors.
 	var msg MessageHeader64
-	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+	if _, err := msg.CopyIn(t, msgPtr); err != nil {
 		return 0, err
 	}
 
@@ -780,7 +767,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
 
 		if int(msg.Flags) != mflags {
 			// Copy out the flags to the caller.
-			if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+			if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
 				return 0, err
 			}
 		}
@@ -817,17 +804,17 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
 	}
 
 	// Copy the control data to the caller.
-	if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+	if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
 		return 0, err
 	}
 	if len(controlData) > 0 {
-		if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+		if _, err := t.CopyOutBytes(usermem.Addr(msg.Control), controlData); err != nil {
 			return 0, err
 		}
 	}
 
 	// Copy out the flags to the caller.
-	if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+	if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
 		return 0, err
 	}
 
@@ -996,7 +983,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		if !ok {
 			return 0, nil, syserror.EFAULT
 		}
-		if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+		if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
 			break
 		}
 		count++
@@ -1011,7 +998,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr usermem.Addr, flags int32) (uintptr, error) {
 	// Capture the message header.
 	var msg MessageHeader64
-	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+	if _, err := msg.CopyIn(t, msgPtr); err != nil {
 		return 0, err
 	}
 
@@ -1022,7 +1009,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
 			return 0, syserror.ENOBUFS
 		}
 		controlData = make([]byte, msg.ControlLen)
-		if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+		if _, err := t.CopyInBytes(usermem.Addr(msg.Control), controlData); err != nil {
 			return 0, err
 		}
 	}
@@ -1065,7 +1052,9 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
 	// Call the syscall implementation.
 	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
 	err = handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file)
-	if err != nil {
+	// Control messages should be released on error as well as for zero-length
+	// messages, which are discarded by the receiver.
+	if n == 0 || err != nil {
 		controlMessages.Release(t)
 	}
 	return uintptr(n), err
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index c69941feb..46616c961 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -16,6 +16,7 @@ package linux
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -141,7 +142,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 
 		// Copy in the offset.
 		var offset int64
-		if _, err := t.CopyIn(offsetAddr, &offset); err != nil {
+		if _, err := primitive.CopyInt64In(t, offsetAddr, &offset); err != nil {
 			return 0, nil, err
 		}
 
@@ -149,11 +150,11 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{
 			Length:    count,
 			SrcOffset: true,
-			SrcStart:  offset,
+			SrcStart:  int64(offset),
 		}, outFile.Flags().NonBlocking)
 
 		// Copy out the new offset.
-		if _, err := t.CopyOut(offsetAddr, n+offset); err != nil {
+		if _, err := primitive.CopyInt64Out(t, offsetAddr, offset+n); err != nil {
 			return 0, nil, err
 		}
 	} else {
@@ -228,7 +229,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			}
 
 			var offset int64
-			if _, err := t.CopyIn(outOffset, &offset); err != nil {
+			if _, err := primitive.CopyInt64In(t, outOffset, &offset); err != nil {
 				return 0, nil, err
 			}
 
@@ -246,7 +247,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			}
 
 			var offset int64
-			if _, err := t.CopyIn(inOffset, &offset); err != nil {
+			if _, err := primitive.CopyInt64In(t, inOffset, &offset); err != nil {
 				return 0, nil, err
 			}
 
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index a5826f2dd..cda29a8b5 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -221,7 +221,7 @@ func statx(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr, statxAddr
 		DevMajor:  uint32(devMajor),
 		DevMinor:  devMinor,
 	}
-	_, err := t.CopyOut(statxAddr, &s)
+	_, err := s.CopyOut(t, statxAddr)
 	return err
 }
 
@@ -283,7 +283,7 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error {
 		FragmentSize: d.Inode.StableAttr.BlockSize,
 		// Leave other fields 0 like simple_statfs does.
 	}
-	_, err = t.CopyOut(addr, &statfs)
+	_, err = statfs.CopyOut(t, addr)
 	return err
 }
 
diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go
index 297de052a..db3d924d9 100644
--- a/pkg/sentry/syscalls/linux/sys_sysinfo.go
+++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go
@@ -21,13 +21,17 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 )
 
-// Sysinfo implements the sysinfo syscall as described in man 2 sysinfo.
+// Sysinfo implements Linux syscall sysinfo(2).
 func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
 
 	mf := t.Kernel().MemoryFile()
-	mf.UpdateUsage()
-	_, totalUsage := usage.MemoryAccounting.Copy()
+	mfUsage, err := mf.TotalUsage()
+	if err != nil {
+		return 0, nil, err
+	}
+	memStats, _ := usage.MemoryAccounting.Copy()
+	totalUsage := mfUsage + memStats.Mapped
 	totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage)
 	memFree := totalSize - totalUsage
 	if memFree > totalSize {
@@ -37,12 +41,12 @@ func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 
 	// Only a subset of the fields in sysinfo_t make sense to return.
 	si := linux.Sysinfo{
-		Procs:    uint16(len(t.PIDNamespace().Tasks())),
+		Procs:    uint16(t.Kernel().TaskSet().Root.NumTasks()),
 		Uptime:   t.Kernel().MonotonicClock().Now().Seconds(),
 		TotalRAM: totalSize,
 		FreeRAM:  memFree,
 		Unit:     1,
 	}
-	_, err := t.CopyOut(addr, si)
+	_, err = si.CopyOut(t, addr)
 	return 0, nil, err
 }
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 101096038..39ca9ea97 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -19,6 +19,7 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
@@ -311,13 +312,13 @@ func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusage
 		return 0, err
 	}
 	if statusAddr != 0 {
-		if _, err := t.CopyOut(statusAddr, wr.Status); err != nil {
+		if _, err := primitive.CopyUint32Out(t, statusAddr, wr.Status); err != nil {
 			return 0, err
 		}
 	}
 	if rusageAddr != 0 {
 		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
-		if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
+		if _, err := ru.CopyOut(t, rusageAddr); err != nil {
 			return 0, err
 		}
 	}
@@ -395,14 +396,14 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			// as well.
 			if infop != 0 {
 				var si arch.SignalInfo
-				_, err = t.CopyOut(infop, &si)
+				_, err = si.CopyOut(t, infop)
 			}
 		}
 		return 0, nil, err
 	}
 	if rusageAddr != 0 {
 		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
-		if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
+		if _, err := ru.CopyOut(t, rusageAddr); err != nil {
 			return 0, nil, err
 		}
 	}
@@ -441,7 +442,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	default:
 		t.Warningf("waitid got incomprehensible wait status %d", s)
 	}
-	_, err = t.CopyOut(infop, &si)
+	_, err = si.CopyOut(t, infop)
 	return 0, nil, err
 }
 
@@ -558,9 +559,7 @@ func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	// third argument to this system call is nowadays unused.
 
 	if cpu != 0 {
-		buf := t.CopyScratchBuffer(4)
-		usermem.ByteOrder.PutUint32(buf, uint32(t.CPU()))
-		if _, err := t.CopyOutBytes(cpu, buf); err != nil {
+		if _, err := primitive.CopyInt32Out(t, cpu, t.CPU()); err != nil {
 			return 0, nil, err
 		}
 	}
diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go
index a2a24a027..c5054d2f1 100644
--- a/pkg/sentry/syscalls/linux/sys_time.go
+++ b/pkg/sentry/syscalls/linux/sys_time.go
@@ -19,6 +19,7 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -168,7 +169,7 @@ func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 		return uintptr(r), nil, nil
 	}
 
-	if _, err := t.CopyOut(addr, r); err != nil {
+	if _, err := r.CopyOut(t, addr); err != nil {
 		return 0, nil, err
 	}
 	return uintptr(r), nil, nil
@@ -334,8 +335,8 @@ func Gettimeofday(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 		// Ask the time package for the timezone.
 		_, offset := time.Now().Zone()
 		// This int32 array mimics linux's struct timezone.
-		timezone := [2]int32{-int32(offset) / 60, 0}
-		_, err := t.CopyOut(tz, timezone)
+		timezone := []int32{-int32(offset) / 60, 0}
+		_, err := primitive.CopyInt32SliceOut(t, tz, timezone)
 		return 0, nil, err
 	}
 	return 0, nil, nil
diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go
index a4c400f87..45eef4feb 100644
--- a/pkg/sentry/syscalls/linux/sys_timer.go
+++ b/pkg/sentry/syscalls/linux/sys_timer.go
@@ -21,81 +21,63 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const nsecPerSec = int64(time.Second)
 
-// copyItimerValIn copies an ItimerVal from the untrusted app range to the
-// kernel.  The ItimerVal may be either 32 or 64 bits.
-// A NULL address is allowed because because Linux allows
-// setitimer(which, NULL, &old_value) which disables the timer.
-// There is a KERN_WARN message saying this misfeature will be removed.
-// However, that hasn't happened as of 3.19, so we continue to support it.
-func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) {
-	if addr == usermem.Addr(0) {
-		return linux.ItimerVal{}, nil
-	}
-
-	switch t.Arch().Width() {
-	case 8:
-		// Native size, just copy directly.
-		var itv linux.ItimerVal
-		if _, err := t.CopyIn(addr, &itv); err != nil {
-			return linux.ItimerVal{}, err
-		}
-
-		return itv, nil
-	default:
-		return linux.ItimerVal{}, syserror.ENOSYS
-	}
-}
-
-// copyItimerValOut copies an ItimerVal to the untrusted app range.
-// The ItimerVal may be either 32 or 64 bits.
-// A NULL address is allowed, in which case no copy takes place
-func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) error {
-	if addr == usermem.Addr(0) {
-		return nil
-	}
-
-	switch t.Arch().Width() {
-	case 8:
-		// Native size, just copy directly.
-		_, err := t.CopyOut(addr, itv)
-		return err
-	default:
-		return syserror.ENOSYS
-	}
-}
-
 // Getitimer implements linux syscall getitimer(2).
 func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	if t.Arch().Width() != 8 {
+		// Definition of linux.ItimerVal assumes 64-bit architecture.
+		return 0, nil, syserror.ENOSYS
+	}
+
 	timerID := args[0].Int()
-	val := args[1].Pointer()
+	addr := args[1].Pointer()
 
 	olditv, err := t.Getitimer(timerID)
 	if err != nil {
 		return 0, nil, err
 	}
-	return 0, nil, copyItimerValOut(t, val, &olditv)
+	// A NULL address is allowed, in which case no copy out takes place.
+	if addr == 0 {
+		return 0, nil, nil
+	}
+	_, err = olditv.CopyOut(t, addr)
+	return 0, nil, err
 }
 
 // Setitimer implements linux syscall setitimer(2).
 func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	timerID := args[0].Int()
-	newVal := args[1].Pointer()
-	oldVal := args[2].Pointer()
+	if t.Arch().Width() != 8 {
+		// Definition of linux.ItimerVal assumes 64-bit architecture.
+		return 0, nil, syserror.ENOSYS
+	}
 
-	newitv, err := copyItimerValIn(t, newVal)
-	if err != nil {
-		return 0, nil, err
+	timerID := args[0].Int()
+	newAddr := args[1].Pointer()
+	oldAddr := args[2].Pointer()
+
+	var newitv linux.ItimerVal
+	// A NULL address is allowed because because Linux allows
+	// setitimer(which, NULL, &old_value) which disables the timer. There is a
+	// KERN_WARN message saying this misfeature will be removed. However, that
+	// hasn't happened as of 3.19, so we continue to support it.
+	if newAddr != 0 {
+		if _, err := newitv.CopyIn(t, newAddr); err != nil {
+			return 0, nil, err
+		}
 	}
 	olditv, err := t.Setitimer(timerID, newitv)
 	if err != nil {
 		return 0, nil, err
 	}
-	return 0, nil, copyItimerValOut(t, oldVal, &olditv)
+	// A NULL address is allowed, in which case no copy out takes place.
+	if oldAddr == 0 {
+		return 0, nil, nil
+	}
+	_, err = olditv.CopyOut(t, oldAddr)
+	return 0, nil, err
 }
 
 // Alarm implements linux syscall alarm(2).
@@ -131,7 +113,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 	var sev *linux.Sigevent
 	if sevp != 0 {
 		sev = &linux.Sigevent{}
-		if _, err = t.CopyIn(sevp, sev); err != nil {
+		if _, err = sev.CopyIn(t, sevp); err != nil {
 			return 0, nil, err
 		}
 	}
@@ -141,7 +123,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 		return 0, nil, err
 	}
 
-	if _, err := t.CopyOut(timerIDp, &id); err != nil {
+	if _, err := id.CopyOut(t, timerIDp); err != nil {
 		t.IntervalTimerDelete(id)
 		return 0, nil, err
 	}
@@ -157,7 +139,7 @@ func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 	oldValAddr := args[3].Pointer()
 
 	var newVal linux.Itimerspec
-	if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+	if _, err := newVal.CopyIn(t, newValAddr); err != nil {
 		return 0, nil, err
 	}
 	oldVal, err := t.IntervalTimerSettime(timerID, newVal, flags&linux.TIMER_ABSTIME != 0)
@@ -165,9 +147,8 @@ func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 		return 0, nil, err
 	}
 	if oldValAddr != 0 {
-		if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
-			return 0, nil, err
-		}
+		_, err = oldVal.CopyOut(t, oldValAddr)
+		return 0, nil, err
 	}
 	return 0, nil, nil
 }
@@ -181,7 +162,7 @@ func TimerGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 	if err != nil {
 		return 0, nil, err
 	}
-	_, err = t.CopyOut(curValAddr, &curVal)
+	_, err = curVal.CopyOut(t, curValAddr)
 	return 0, nil, err
 }
 
diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go
index 34b03e4ee..cadd9d348 100644
--- a/pkg/sentry/syscalls/linux/sys_timerfd.go
+++ b/pkg/sentry/syscalls/linux/sys_timerfd.go
@@ -81,7 +81,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
 	}
 
 	var newVal linux.Itimerspec
-	if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+	if _, err := newVal.CopyIn(t, newValAddr); err != nil {
 		return 0, nil, err
 	}
 	newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tf.Clock())
@@ -91,7 +91,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
 	tm, oldS := tf.SetTime(newS)
 	if oldValAddr != 0 {
 		oldVal := ktime.ItimerspecFromSetting(tm, oldS)
-		if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
+		if _, err := oldVal.CopyOut(t, oldValAddr); err != nil {
 			return 0, nil, err
 		}
 	}
@@ -116,6 +116,6 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
 
 	tm, s := tf.GetTime()
 	curVal := ktime.ItimerspecFromSetting(tm, s)
-	_, err := t.CopyOut(curValAddr, &curVal)
+	_, err := curVal.CopyOut(t, curValAddr)
 	return 0, nil, err
 }
diff --git a/pkg/sentry/syscalls/linux/sys_tls_amd64.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go
index b3eb96a1c..6ddd30d5c 100644
--- a/pkg/sentry/syscalls/linux/sys_tls_amd64.go
+++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go
@@ -18,6 +18,7 @@ package linux
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -30,17 +31,19 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	case linux.ARCH_GET_FS:
 		addr := args[1].Pointer()
 		fsbase := t.Arch().TLS()
-		_, err := t.CopyOut(addr, uint64(fsbase))
-		if err != nil {
-			return 0, nil, err
+		switch t.Arch().Width() {
+		case 8:
+			if _, err := primitive.CopyUint64Out(t, addr, uint64(fsbase)); err != nil {
+				return 0, nil, err
+			}
+		default:
+			return 0, nil, syserror.ENOSYS
 		}
-
 	case linux.ARCH_SET_FS:
 		fsbase := args[1].Uint64()
 		if !t.Arch().SetTLS(uintptr(fsbase)) {
 			return 0, nil, syserror.EPERM
 		}
-
 	case linux.ARCH_GET_GS, linux.ARCH_SET_GS:
 		t.Kernel().EmitUnimplementedEvent(t)
 		fallthrough
diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go
index e9d702e8e..66c5974f5 100644
--- a/pkg/sentry/syscalls/linux/sys_utsname.go
+++ b/pkg/sentry/syscalls/linux/sys_utsname.go
@@ -46,7 +46,7 @@ func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 
 	// Copy out the result.
 	va := args[0].Pointer()
-	_, err := t.CopyOut(va, u)
+	_, err := u.CopyOut(t, va)
 	return 0, nil, err
 }
 
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index 64696b438..9ee766552 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -44,6 +44,9 @@ go_library(
         "//pkg/context",
         "//pkg/fspath",
         "//pkg/gohacks",
+        "//pkg/log",
+        "//pkg/marshal",
+        "//pkg/marshal/primitive",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsbridge",
@@ -72,7 +75,5 @@ go_library(
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
-        "//tools/go_marshal/marshal",
-        "//tools/go_marshal/primitive",
     ],
 )
diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go
index 42559bf69..6d0a38330 100644
--- a/pkg/sentry/syscalls/linux/vfs2/aio.go
+++ b/pkg/sentry/syscalls/linux/vfs2/aio.go
@@ -17,6 +17,7 @@ package vfs2
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -38,21 +39,27 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 
 	for i := int32(0); i < nrEvents; i++ {
-		// Copy in the address.
-		cbAddrNative := t.Arch().Native(0)
-		if _, err := t.CopyIn(addr, cbAddrNative); err != nil {
-			if i > 0 {
-				// Some successful.
-				return uintptr(i), nil, nil
+		// Copy in the callback address.
+		var cbAddr usermem.Addr
+		switch t.Arch().Width() {
+		case 8:
+			var cbAddrP primitive.Uint64
+			if _, err := cbAddrP.CopyIn(t, addr); err != nil {
+				if i > 0 {
+					// Some successful.
+					return uintptr(i), nil, nil
+				}
+				// Nothing done.
+				return 0, nil, err
 			}
-			// Nothing done.
-			return 0, nil, err
+			cbAddr = usermem.Addr(cbAddrP)
+		default:
+			return 0, nil, syserror.ENOSYS
 		}
 
 		// Copy in this callback.
 		var cb linux.IOCallback
-		cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative))
-		if _, err := t.CopyIn(cbAddr, &cb); err != nil {
+		if _, err := cb.CopyIn(t, cbAddr); err != nil {
 			if i > 0 {
 				// Some have been successful.
 				return uintptr(i), nil, nil
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
index c62f03509..d0cbb77eb 100644
--- a/pkg/sentry/syscalls/linux/vfs2/epoll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -24,7 +24,6 @@ import (
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -141,50 +140,26 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, syserror.EINVAL
 	}
 
-	// Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent,
-	// maxEvents), so that the buffer can be allocated on the stack.
+	// Allocate space for a few events on the stack for the common case in
+	// which we don't have too many events.
 	var (
-		events       [16]linux.EpollEvent
-		total        int
+		eventsArr    [16]linux.EpollEvent
 		ch           chan struct{}
 		haveDeadline bool
 		deadline     ktime.Time
 	)
 	for {
-		batchEvents := len(events)
-		if batchEvents > maxEvents {
-			batchEvents = maxEvents
-		}
-		n := ep.ReadEvents(events[:batchEvents])
-		maxEvents -= n
-		if n != 0 {
-			// Copy what we read out.
-			copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events[:n])
+		events := ep.ReadEvents(eventsArr[:0], maxEvents)
+		if len(events) != 0 {
+			copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events)
 			copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
-			eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent)
-			total += copiedEvents
-			if err != nil {
-				if total != 0 {
-					return uintptr(total), nil, nil
-				}
-				return 0, nil, err
-			}
-			// If we've filled the application's event buffer, we're done.
-			if maxEvents == 0 {
-				return uintptr(total), nil, nil
-			}
-			// Loop if we read a full batch, under the expectation that there
-			// may be more events to read.
-			if n == batchEvents {
-				continue
+			if copiedEvents != 0 {
+				return uintptr(copiedEvents), nil, nil
 			}
+			return 0, nil, err
 		}
-		// We get here if n != batchEvents. If we read any number of events
-		// (just now, or in a previous iteration of this loop), or if timeout
-		// is 0 (such that epoll_wait should be non-blocking), return the
-		// events we've read so far to the application.
-		if total != 0 || timeout == 0 {
-			return uintptr(total), nil, nil
+		if timeout == 0 {
+			return 0, nil, nil
 		}
 		// In the first iteration of this loop, register with the epoll
 		// instance for readability events, but then immediately continue the
@@ -207,8 +182,6 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 				if err == syserror.ETIMEDOUT {
 					err = nil
 				}
-				// total must be 0 since otherwise we would have returned
-				// above.
 				return 0, nil, err
 			}
 		}
diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go
index 066ee0863..c8ce2aabc 100644
--- a/pkg/sentry/syscalls/linux/vfs2/execve.go
+++ b/pkg/sentry/syscalls/linux/vfs2/execve.go
@@ -110,8 +110,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user
 	}
 
 	// Load the new TaskContext.
-	mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change
-	defer mntns.DecRef(t)
+	mntns := t.MountNamespaceVFS2()
 	wd := t.FSContext().WorkingDirectoryVFS2()
 	defer wd.DecRef(t)
 	remainingTraversals := uint(linux.MaxSymlinkTraversals)
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
index 4856554fe..36e89700e 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -34,7 +34,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	// Note that Remove provides a reference on the file that we may use to
 	// flush. It is still active until we drop the final reference below
 	// (and other reference-holding operations complete).
-	_, file := t.FDTable().Remove(fd)
+	_, file := t.FDTable().Remove(t, fd)
 	if file == nil {
 		return 0, nil, syserror.EBADF
 	}
@@ -137,7 +137,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return uintptr(flags.ToLinuxFDFlags()), nil, nil
 	case linux.F_SETFD:
 		flags := args[2].Uint()
-		err := t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+		err := t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{
 			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
 		})
 		return 0, nil, err
@@ -145,16 +145,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return uintptr(file.StatusFlags()), nil, nil
 	case linux.F_SETFL:
 		return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())
-	case linux.F_SETPIPE_SZ:
-		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
-		if !ok {
-			return 0, nil, syserror.EBADF
-		}
-		n, err := pipefile.SetPipeSize(int64(args[2].Int()))
-		if err != nil {
-			return 0, nil, err
-		}
-		return uintptr(n), nil, nil
 	case linux.F_GETOWN:
 		owner, hasOwner := getAsyncOwner(t, file)
 		if !hasOwner {
@@ -181,15 +171,25 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		if !hasOwner {
 			return 0, nil, nil
 		}
-		_, err := t.CopyOut(args[2].Pointer(), &owner)
+		_, err := owner.CopyOut(t, args[2].Pointer())
 		return 0, nil, err
 	case linux.F_SETOWN_EX:
 		var owner linux.FOwnerEx
-		_, err := t.CopyIn(args[2].Pointer(), &owner)
+		_, err := owner.CopyIn(t, args[2].Pointer())
 		if err != nil {
 			return 0, nil, err
 		}
 		return 0, nil, setAsyncOwner(t, file, owner.Type, owner.PID)
+	case linux.F_SETPIPE_SZ:
+		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
+		if !ok {
+			return 0, nil, syserror.EBADF
+		}
+		n, err := pipefile.SetPipeSize(int64(args[2].Int()))
+		if err != nil {
+			return 0, nil, err
+		}
+		return uintptr(n), nil, nil
 	case linux.F_GETPIPE_SZ:
 		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
 		if !ok {
@@ -286,7 +286,7 @@ func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescrip
 	// Copy in the lock request.
 	flockAddr := args[2].Pointer()
 	var flock linux.Flock
-	if _, err := t.CopyIn(flockAddr, &flock); err != nil {
+	if _, err := flock.CopyIn(t, flockAddr); err != nil {
 		return err
 	}
 
diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
index 38778a388..2806c3f6f 100644
--- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go
+++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
@@ -16,6 +16,7 @@ package vfs2
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -34,20 +35,20 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	// Handle ioctls that apply to all FDs.
 	switch args[1].Int() {
 	case linux.FIONCLEX:
-		t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+		t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{
 			CloseOnExec: false,
 		})
 		return 0, nil, nil
 
 	case linux.FIOCLEX:
-		t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{
+		t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{
 			CloseOnExec: true,
 		})
 		return 0, nil, nil
 
 	case linux.FIONBIO:
 		var set int32
-		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
 			return 0, nil, err
 		}
 		flags := file.StatusFlags()
@@ -60,7 +61,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 
 	case linux.FIOASYNC:
 		var set int32
-		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
+		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
 			return 0, nil, err
 		}
 		flags := file.StatusFlags()
@@ -82,12 +83,12 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 				who = owner.PID
 			}
 		}
-		_, err := t.CopyOut(args[2].Pointer(), &who)
+		_, err := primitive.CopyInt32Out(t, args[2].Pointer(), who)
 		return 0, nil, err
 
 	case linux.FIOSETOWN, linux.SIOCSPGRP:
 		var who int32
-		if _, err := t.CopyIn(args[2].Pointer(), &who); err != nil {
+		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &who); err != nil {
 			return 0, nil, err
 		}
 		ownerType := int32(linux.F_OWNER_PID)
diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go
index dc05c2994..9d9dbf775 100644
--- a/pkg/sentry/syscalls/linux/vfs2/mmap.go
+++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go
@@ -17,6 +17,7 @@ package vfs2
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -85,6 +86,17 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 		if err := file.ConfigureMMap(t, &opts); err != nil {
 			return 0, nil, err
 		}
+	} else if shared {
+		// Back shared anonymous mappings with an anonymous tmpfs file.
+		opts.Offset = 0
+		file, err := tmpfs.NewZeroFile(t, t.Credentials(), t.Kernel().ShmMount(), opts.Length)
+		if err != nil {
+			return 0, nil, err
+		}
+		defer file.DecRef(t)
+		if err := file.ConfigureMMap(t, &opts); err != nil {
+			return 0, nil, err
+		}
 	}
 
 	rv, err := t.MemoryManager().MMap(t, opts)
diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go
index 4bd5c7ca2..769c9b92f 100644
--- a/pkg/sentry/syscalls/linux/vfs2/mount.go
+++ b/pkg/sentry/syscalls/linux/vfs2/mount.go
@@ -109,8 +109,8 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return 0, nil, err
 	}
 	defer target.Release(t)
-
-	return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts)
+	_, err = t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts)
+	return 0, nil, err
 }
 
 // Umount2 implements Linux syscall umount2(2).
diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go
index 9b4848d9e..ee38fdca0 100644
--- a/pkg/sentry/syscalls/linux/vfs2/pipe.go
+++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go
@@ -16,6 +16,7 @@ package vfs2
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -51,9 +52,9 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error {
 	if err != nil {
 		return err
 	}
-	if _, err := t.CopyOut(addr, fds); err != nil {
+	if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil {
 		for _, fd := range fds {
-			if _, file := t.FDTable().Remove(fd); file != nil {
+			if _, file := t.FDTable().Remove(t, fd); file != nil {
 				file.DecRef(t)
 			}
 		}
diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go
index 79ad64039..c22e4ce54 100644
--- a/pkg/sentry/syscalls/linux/vfs2/poll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/poll.go
@@ -165,7 +165,7 @@ func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD
 
 	pfd := make([]linux.PollFD, nfds)
 	if nfds > 0 {
-		if _, err := t.CopyIn(addr, &pfd); err != nil {
+		if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil {
 			return nil, err
 		}
 	}
@@ -192,7 +192,7 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration)
 	// The poll entries are copied out regardless of whether
 	// any are set or not. This aligns with the Linux behavior.
 	if nfds > 0 && err == nil {
-		if _, err := t.CopyOut(addr, pfd); err != nil {
+		if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil {
 			return remainingTimeout, 0, err
 		}
 	}
@@ -205,7 +205,7 @@ func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialBy
 	set := make([]byte, nBytes)
 
 	if addr != 0 {
-		if _, err := t.CopyIn(addr, &set); err != nil {
+		if _, err := t.CopyInBytes(addr, set); err != nil {
 			return nil, err
 		}
 		// If we only use part of the last byte, mask out the extraneous bits.
@@ -332,19 +332,19 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
 
 	// Copy updated vectors back.
 	if readFDs != 0 {
-		if _, err := t.CopyOut(readFDs, r); err != nil {
+		if _, err := t.CopyOutBytes(readFDs, r); err != nil {
 			return 0, err
 		}
 	}
 
 	if writeFDs != 0 {
-		if _, err := t.CopyOut(writeFDs, w); err != nil {
+		if _, err := t.CopyOutBytes(writeFDs, w); err != nil {
 			return 0, err
 		}
 	}
 
 	if exceptFDs != 0 {
-		if _, err := t.CopyOut(exceptFDs, e); err != nil {
+		if _, err := t.CopyOutBytes(exceptFDs, e); err != nil {
 			return 0, err
 		}
 	}
@@ -497,6 +497,12 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	return n, nil, err
 }
 
+// +marshal
+type sigSetWithSize struct {
+	sigsetAddr   uint64
+	sizeofSigset uint64
+}
+
 // Pselect implements linux syscall pselect(2).
 func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	nfds := int(args[0].Int()) // select(2) uses an int.
@@ -538,12 +544,6 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	return n, nil, err
 }
 
-// +marshal
-type sigSetWithSize struct {
-	sigsetAddr   uint64
-	sizeofSigset uint64
-}
-
 // copyTimespecInToDuration copies a Timespec from the untrusted app range,
 // validates it and converts it to a Duration.
 //
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
index 5e6eb13ba..1ee37e5a8 100644
--- a/pkg/sentry/syscalls/linux/vfs2/setstat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -346,7 +346,7 @@ func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr usermem.Addr, opt
 		return nil
 	}
 	var times [2]linux.Timeval
-	if _, err := t.CopyIn(timesAddr, &times); err != nil {
+	if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
 		return err
 	}
 	if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 {
@@ -410,7 +410,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op
 		return nil
 	}
 	var times [2]linux.Timespec
-	if _, err := t.CopyIn(timesAddr, &times); err != nil {
+	if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil {
 		return err
 	}
 	if times[0].Nsec != linux.UTIME_OMIT {
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index a5032657a..7b33b3f59 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -19,6 +19,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -30,8 +32,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
-	"gvisor.dev/gvisor/tools/go_marshal/primitive"
 )
 
 // minListenBacklog is the minimum reasonable backlog for listening sockets.
@@ -66,10 +66,10 @@ const flagsOffset = 48
 const sizeOfInt32 = 4
 
 // messageHeader64Len is the length of a MessageHeader64 struct.
-var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes())
 
 // multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct.
-var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes())
 
 // baseRecvFlags are the flags that are accepted across recvmsg(2),
 // recvmmsg(2), and recvfrom(2).
@@ -77,6 +77,8 @@ const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT |
 
 // MessageHeader64 is the 64-bit representation of the msghdr struct used in
 // the recvmsg and sendmsg syscalls.
+//
+// +marshal
 type MessageHeader64 struct {
 	// Name is the optional pointer to a network address buffer.
 	Name uint64
@@ -105,30 +107,14 @@ type MessageHeader64 struct {
 
 // multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
 // the recvmmsg and sendmmsg syscalls.
+//
+// +marshal
 type multipleMessageHeader64 struct {
 	msgHdr MessageHeader64
 	msgLen uint32
 	_      int32
 }
 
-// CopyInMessageHeader64 copies a message header from user to kernel memory.
-func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
-	b := t.CopyScratchBuffer(52)
-	if _, err := t.CopyInBytes(addr, b); err != nil {
-		return err
-	}
-
-	msg.Name = usermem.ByteOrder.Uint64(b[0:])
-	msg.NameLen = usermem.ByteOrder.Uint32(b[8:])
-	msg.Iov = usermem.ByteOrder.Uint64(b[16:])
-	msg.IovLen = usermem.ByteOrder.Uint64(b[24:])
-	msg.Control = usermem.ByteOrder.Uint64(b[32:])
-	msg.ControlLen = usermem.ByteOrder.Uint64(b[40:])
-	msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:]))
-
-	return nil
-}
-
 // CaptureAddress allocates memory for and copies a socket address structure
 // from the untrusted address space range.
 func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
@@ -147,10 +133,10 @@ func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte,
 // writeAddress writes a sockaddr structure and its length to an output buffer
 // in the unstrusted address space range. If the address is bigger than the
 // buffer, it is truncated.
-func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
 	// Get the buffer length.
 	var bufLen uint32
-	if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil {
+	if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil {
 		return err
 	}
 
@@ -159,7 +145,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user
 	}
 
 	// Write the length unconditionally.
-	if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+	if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil {
 		return err
 	}
 
@@ -172,7 +158,8 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user
 	}
 
 	// Copy as much of the address as will fit in the buffer.
-	encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr)
+	encodedAddr := t.CopyScratchBuffer(addr.SizeBytes())
+	addr.MarshalUnsafe(encodedAddr)
 	if bufLen > uint32(len(encodedAddr)) {
 		bufLen = uint32(len(encodedAddr))
 	}
@@ -250,9 +237,9 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 		return 0, nil, err
 	}
 
-	if _, err := t.CopyOut(addr, fds); err != nil {
+	if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil {
 		for _, fd := range fds {
-			if _, file := t.FDTable().Remove(fd); file != nil {
+			if _, file := t.FDTable().Remove(t, fd); file != nil {
 				file.DecRef(t)
 			}
 		}
@@ -459,8 +446,8 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	}
 
 	// Read the length. Reject negative values.
-	optLen := int32(0)
-	if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+	var optLen int32
+	if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil {
 		return 0, nil, err
 	}
 	if optLen < 0 {
@@ -474,7 +461,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	}
 
 	vLen := int32(binary.Size(v))
-	if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+	if _, err := primitive.CopyInt32Out(t, optLenAddr, vLen); err != nil {
 		return 0, nil, err
 	}
 
@@ -736,7 +723,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		if !ok {
 			return 0, nil, syserror.EFAULT
 		}
-		if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+		if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
 			break
 		}
 		count++
@@ -751,7 +738,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
 	// Capture the message header and io vectors.
 	var msg MessageHeader64
-	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+	if _, err := msg.CopyIn(t, msgPtr); err != nil {
 		return 0, err
 	}
 
@@ -783,7 +770,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla
 
 		if int(msg.Flags) != mflags {
 			// Copy out the flags to the caller.
-			if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+			if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
 				return 0, err
 			}
 		}
@@ -820,17 +807,17 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla
 	}
 
 	// Copy the control data to the caller.
-	if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+	if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
 		return 0, err
 	}
 	if len(controlData) > 0 {
-		if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+		if _, err := t.CopyOutBytes(usermem.Addr(msg.Control), controlData); err != nil {
 			return 0, err
 		}
 	}
 
 	// Copy out the flags to the caller.
-	if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+	if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
 		return 0, err
 	}
 
@@ -999,7 +986,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		if !ok {
 			return 0, nil, syserror.EFAULT
 		}
-		if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+		if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
 			break
 		}
 		count++
@@ -1014,7 +1001,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr usermem.Addr, flags int32) (uintptr, error) {
 	// Capture the message header.
 	var msg MessageHeader64
-	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+	if _, err := msg.CopyIn(t, msgPtr); err != nil {
 		return 0, err
 	}
 
@@ -1025,7 +1012,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio
 			return 0, syserror.ENOBUFS
 		}
 		controlData = make([]byte, msg.ControlLen)
-		if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+		if _, err := t.CopyInBytes(usermem.Addr(msg.Control), controlData); err != nil {
 			return 0, err
 		}
 	}
@@ -1068,7 +1055,9 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio
 	// Call the syscall implementation.
 	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
 	err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file)
-	if err != nil {
+	// Control messages should be released on error as well as for zero-length
+	// messages, which are discarded by the receiver.
+	if n == 0 || err != nil {
 		controlMessages.Release(t)
 	}
 	return uintptr(n), err
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
index 75bfa2c79..035e2a6b0 100644
--- a/pkg/sentry/syscalls/linux/vfs2/splice.go
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -18,9 +18,12 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -42,6 +45,9 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	if count > int64(kernel.MAX_RW_COUNT) {
 		count = int64(kernel.MAX_RW_COUNT)
 	}
+	if count < 0 {
+		return 0, nil, syserror.EINVAL
+	}
 
 	// Check for invalid flags.
 	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
@@ -88,7 +94,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 		if inFile.Options().DenyPRead {
 			return 0, nil, syserror.EINVAL
 		}
-		if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil {
+		if _, err := primitive.CopyInt64In(t, inOffsetPtr, &inOffset); err != nil {
 			return 0, nil, err
 		}
 		if inOffset < 0 {
@@ -103,7 +109,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 		if outFile.Options().DenyPWrite {
 			return 0, nil, syserror.EINVAL
 		}
-		if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil {
+		if _, err := primitive.CopyInt64In(t, outOffsetPtr, &outOffset); err != nil {
 			return 0, nil, err
 		}
 		if outOffset < 0 {
@@ -131,21 +137,17 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 		case inIsPipe && outIsPipe:
 			n, err = pipe.Splice(t, outPipeFD, inPipeFD, count)
 		case inIsPipe:
+			n, err = inPipeFD.SpliceToNonPipe(t, outFile, outOffset, count)
 			if outOffset != -1 {
-				n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{})
 				outOffset += n
-			} else {
-				n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{})
 			}
 		case outIsPipe:
+			n, err = outPipeFD.SpliceFromNonPipe(t, inFile, inOffset, count)
 			if inOffset != -1 {
-				n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{})
 				inOffset += n
-			} else {
-				n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{})
 			}
 		default:
-			panic("not possible")
+			panic("at least one end of splice must be a pipe")
 		}
 
 		if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
@@ -158,25 +160,26 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 
 	// Copy updated offsets out.
 	if inOffsetPtr != 0 {
-		if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil {
+		if _, err := primitive.CopyInt64Out(t, inOffsetPtr, inOffset); err != nil {
 			return 0, nil, err
 		}
 	}
 	if outOffsetPtr != 0 {
-		if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil {
+		if _, err := primitive.CopyInt64Out(t, outOffsetPtr, outOffset); err != nil {
 			return 0, nil, err
 		}
 	}
 
-	if n == 0 {
-		return 0, nil, err
+	if n != 0 {
+		// On Linux, inotify behavior is not very consistent with splice(2). We try
+		// our best to emulate Linux for very basic calls to splice, where for some
+		// reason, events are generated for output files, but not input files.
+		outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
 	}
 
-	// On Linux, inotify behavior is not very consistent with splice(2). We try
-	// our best to emulate Linux for very basic calls to splice, where for some
-	// reason, events are generated for output files, but not input files.
-	outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
-	return uintptr(n), nil, nil
+	// We can only pass a single file to handleIOError, so pick inFile arbitrarily.
+	// This is used only for debugging purposes.
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "splice", outFile)
 }
 
 // Tee implements Linux syscall tee(2).
@@ -192,6 +195,9 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
 	if count > int64(kernel.MAX_RW_COUNT) {
 		count = int64(kernel.MAX_RW_COUNT)
 	}
+	if count < 0 {
+		return 0, nil, syserror.EINVAL
+	}
 
 	// Check for invalid flags.
 	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
@@ -248,11 +254,20 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
 			break
 		}
 	}
-	if n == 0 {
-		return 0, nil, err
+
+	if n != 0 {
+		outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+
+		// If a partial write is completed, the error is dropped. Log it here.
+		if err != nil && err != io.EOF && err != syserror.ErrWouldBlock {
+			log.Debugf("tee completed a partial write with error: %v", err)
+			err = nil
+		}
 	}
-	outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
-	return uintptr(n), nil, nil
+
+	// We can only pass a single file to handleIOError, so pick inFile arbitrarily.
+	// This is used only for debugging purposes.
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "tee", inFile)
 }
 
 // Sendfile implements linux system call sendfile(2).
@@ -301,9 +316,12 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		if inFile.Options().DenyPRead {
 			return 0, nil, syserror.ESPIPE
 		}
-		if _, err := t.CopyIn(offsetAddr, &offset); err != nil {
+		var offsetP primitive.Int64
+		if _, err := offsetP.CopyIn(t, offsetAddr); err != nil {
 			return 0, nil, err
 		}
+		offset = int64(offsetP)
+
 		if offset < 0 {
 			return 0, nil, syserror.EINVAL
 		}
@@ -341,16 +359,9 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	if outIsPipe {
 		for n < count {
 			var spliceN int64
+			spliceN, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count)
 			if offset != -1 {
-				spliceN, err = inFile.PRead(t, outPipeFD.IOSequence(count), offset, vfs.ReadOptions{})
 				offset += spliceN
-			} else {
-				spliceN, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{})
-			}
-			if spliceN == 0 && err == io.EOF {
-				// We reached the end of the file. Eat the error and exit the loop.
-				err = nil
-				break
 			}
 			n += spliceN
 			if err == syserror.ErrWouldBlock && !nonBlock {
@@ -371,19 +382,11 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 			} else {
 				readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
 			}
-			if readN == 0 && err == io.EOF {
-				// We reached the end of the file. Eat the error and exit the loop.
-				err = nil
-				break
-			}
 			n += readN
-			if err != nil {
-				break
-			}
 
 			// Write all of the bytes that we read. This may need
 			// multiple write calls to complete.
-			wbuf := buf[:n]
+			wbuf := buf[:readN]
 			for len(wbuf) > 0 {
 				var writeN int64
 				writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{})
@@ -392,12 +395,21 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 					err = dw.waitForOut(t)
 				}
 				if err != nil {
-					// We didn't complete the write. Only
-					// report the bytes that were actually
-					// written, and rewind the offset.
+					// We didn't complete the write. Only report the bytes that were actually
+					// written, and rewind offsets as needed.
 					notWritten := int64(len(wbuf))
 					n -= notWritten
-					if offset != -1 {
+					if offset == -1 {
+						// We modified the offset of the input file itself during the read
+						// operation. Rewind it.
+						if _, seekErr := inFile.Seek(t, -notWritten, linux.SEEK_CUR); seekErr != nil {
+							// Log the error but don't return it, since the write has already
+							// completed successfully.
+							log.Warningf("failed to roll back input file offset: %v", seekErr)
+						}
+					} else {
+						// The sendfile call was provided an offset parameter that should be
+						// adjusted to reflect the number of bytes sent. Rewind it.
 						offset -= notWritten
 					}
 					break
@@ -414,18 +426,26 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 
 	if offsetAddr != 0 {
 		// Copy out the new offset.
-		if _, err := t.CopyOut(offsetAddr, offset); err != nil {
+		offsetP := primitive.Uint64(offset)
+		if _, err := offsetP.CopyOut(t, offsetAddr); err != nil {
 			return 0, nil, err
 		}
 	}
 
-	if n == 0 {
-		return 0, nil, err
+	if n != 0 {
+		inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
+		outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
+
+		if err != nil && err != io.EOF && err != syserror.ErrWouldBlock {
+			// If a partial write is completed, the error is dropped. Log it here.
+			log.Debugf("sendfile completed a partial write with error: %v", err)
+			err = nil
+		}
 	}
 
-	inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
-	outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
-	return uintptr(n), nil, nil
+	// We can only pass a single file to handleIOError, so pick inFile arbitrarily.
+	// This is used only for debugging purposes.
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "sendfile", inFile)
 }
 
 // dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not
diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
index 7a26890ef..250870c03 100644
--- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
@@ -87,7 +87,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
 	}
 
 	var newVal linux.Itimerspec
-	if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+	if _, err := newVal.CopyIn(t, newValAddr); err != nil {
 		return 0, nil, err
 	}
 	newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock())
@@ -97,7 +97,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
 	tm, oldS := tfd.SetTime(newS)
 	if oldValAddr != 0 {
 		oldVal := ktime.ItimerspecFromSetting(tm, oldS)
-		if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
+		if _, err := oldVal.CopyOut(t, oldValAddr); err != nil {
 			return 0, nil, err
 		}
 	}
@@ -122,6 +122,6 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
 
 	tm, s := tfd.GetTime()
 	curVal := ktime.ItimerspecFromSetting(tm, s)
-	_, err := t.CopyOut(curValAddr, &curVal)
+	_, err := curVal.CopyOut(t, curValAddr)
 	return 0, nil, err
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
index c576d9475..c50fd97eb 100644
--- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -93,16 +93,16 @@ func Override() {
 	s.Table[165] = syscalls.Supported("mount", Mount)
 	s.Table[166] = syscalls.Supported("umount2", Umount2)
 	s.Table[187] = syscalls.Supported("readahead", Readahead)
-	s.Table[188] = syscalls.Supported("setxattr", Setxattr)
+	s.Table[188] = syscalls.Supported("setxattr", SetXattr)
 	s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
 	s.Table[190] = syscalls.Supported("fsetxattr", Fsetxattr)
-	s.Table[191] = syscalls.Supported("getxattr", Getxattr)
+	s.Table[191] = syscalls.Supported("getxattr", GetXattr)
 	s.Table[192] = syscalls.Supported("lgetxattr", Lgetxattr)
 	s.Table[193] = syscalls.Supported("fgetxattr", Fgetxattr)
-	s.Table[194] = syscalls.Supported("listxattr", Listxattr)
+	s.Table[194] = syscalls.Supported("listxattr", ListXattr)
 	s.Table[195] = syscalls.Supported("llistxattr", Llistxattr)
 	s.Table[196] = syscalls.Supported("flistxattr", Flistxattr)
-	s.Table[197] = syscalls.Supported("removexattr", Removexattr)
+	s.Table[197] = syscalls.Supported("removexattr", RemoveXattr)
 	s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr)
 	s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr)
 	s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"})
@@ -163,16 +163,17 @@ func Override() {
 
 	// Override ARM64.
 	s = linux.ARM64
-	s.Table[5] = syscalls.Supported("setxattr", Setxattr)
+	s.Table[2] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"})
+	s.Table[5] = syscalls.Supported("setxattr", SetXattr)
 	s.Table[6] = syscalls.Supported("lsetxattr", Lsetxattr)
 	s.Table[7] = syscalls.Supported("fsetxattr", Fsetxattr)
-	s.Table[8] = syscalls.Supported("getxattr", Getxattr)
+	s.Table[8] = syscalls.Supported("getxattr", GetXattr)
 	s.Table[9] = syscalls.Supported("lgetxattr", Lgetxattr)
 	s.Table[10] = syscalls.Supported("fgetxattr", Fgetxattr)
-	s.Table[11] = syscalls.Supported("listxattr", Listxattr)
+	s.Table[11] = syscalls.Supported("listxattr", ListXattr)
 	s.Table[12] = syscalls.Supported("llistxattr", Llistxattr)
 	s.Table[13] = syscalls.Supported("flistxattr", Flistxattr)
-	s.Table[14] = syscalls.Supported("removexattr", Removexattr)
+	s.Table[14] = syscalls.Supported("removexattr", RemoveXattr)
 	s.Table[15] = syscalls.Supported("lremovexattr", Lremovexattr)
 	s.Table[16] = syscalls.Supported("fremovexattr", Fremovexattr)
 	s.Table[17] = syscalls.Supported("getcwd", Getcwd)
@@ -200,6 +201,7 @@ func Override() {
 	s.Table[44] = syscalls.Supported("fstatfs", Fstatfs)
 	s.Table[45] = syscalls.Supported("truncate", Truncate)
 	s.Table[46] = syscalls.Supported("ftruncate", Ftruncate)
+	s.Table[47] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil)
 	s.Table[48] = syscalls.Supported("faccessat", Faccessat)
 	s.Table[49] = syscalls.Supported("chdir", Chdir)
 	s.Table[50] = syscalls.Supported("fchdir", Fchdir)
@@ -221,12 +223,14 @@ func Override() {
 	s.Table[68] = syscalls.Supported("pwrite64", Pwrite64)
 	s.Table[69] = syscalls.Supported("preadv", Preadv)
 	s.Table[70] = syscalls.Supported("pwritev", Pwritev)
+	s.Table[71] = syscalls.Supported("sendfile", Sendfile)
 	s.Table[72] = syscalls.Supported("pselect", Pselect)
 	s.Table[73] = syscalls.Supported("ppoll", Ppoll)
 	s.Table[74] = syscalls.Supported("signalfd4", Signalfd4)
 	s.Table[76] = syscalls.Supported("splice", Splice)
 	s.Table[77] = syscalls.Supported("tee", Tee)
 	s.Table[78] = syscalls.Supported("readlinkat", Readlinkat)
+	s.Table[79] = syscalls.Supported("newfstatat", Newfstatat)
 	s.Table[80] = syscalls.Supported("fstat", Fstat)
 	s.Table[81] = syscalls.Supported("sync", Sync)
 	s.Table[82] = syscalls.Supported("fsync", Fsync)
@@ -251,8 +255,10 @@ func Override() {
 	s.Table[210] = syscalls.Supported("shutdown", Shutdown)
 	s.Table[211] = syscalls.Supported("sendmsg", SendMsg)
 	s.Table[212] = syscalls.Supported("recvmsg", RecvMsg)
+	s.Table[213] = syscalls.Supported("readahead", Readahead)
 	s.Table[221] = syscalls.Supported("execve", Execve)
 	s.Table[222] = syscalls.Supported("mmap", Mmap)
+	s.Table[223] = syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil)
 	s.Table[242] = syscalls.Supported("accept4", Accept4)
 	s.Table[243] = syscalls.Supported("recvmmsg", RecvMMsg)
 	s.Table[267] = syscalls.Supported("syncfs", Syncfs)
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
index ef99246ed..e05723ef9 100644
--- a/pkg/sentry/syscalls/linux/vfs2/xattr.go
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -26,8 +26,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// Listxattr implements Linux syscall listxattr(2).
-func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// ListXattr implements Linux syscall listxattr(2).
+func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return listxattr(t, args, followFinalSymlink)
 }
 
@@ -51,7 +51,7 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml
 	}
 	defer tpop.Release(t)
 
-	names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
+	names, err := t.Kernel().VFS().ListXattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
 	if err != nil {
 		return 0, nil, err
 	}
@@ -74,7 +74,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	}
 	defer file.DecRef(t)
 
-	names, err := file.Listxattr(t, uint64(size))
+	names, err := file.ListXattr(t, uint64(size))
 	if err != nil {
 		return 0, nil, err
 	}
@@ -85,8 +85,8 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	return uintptr(n), nil, nil
 }
 
-// Getxattr implements Linux syscall getxattr(2).
-func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// GetXattr implements Linux syscall getxattr(2).
+func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return getxattr(t, args, followFinalSymlink)
 }
 
@@ -116,7 +116,7 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
 		return 0, nil, err
 	}
 
-	value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetxattrOptions{
+	value, err := t.Kernel().VFS().GetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetXattrOptions{
 		Name: name,
 		Size: uint64(size),
 	})
@@ -148,7 +148,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, err
 	}
 
-	value, err := file.Getxattr(t, &vfs.GetxattrOptions{Name: name, Size: uint64(size)})
+	value, err := file.GetXattr(t, &vfs.GetXattrOptions{Name: name, Size: uint64(size)})
 	if err != nil {
 		return 0, nil, err
 	}
@@ -159,8 +159,8 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return uintptr(n), nil, nil
 }
 
-// Setxattr implements Linux syscall setxattr(2).
-func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// SetXattr implements Linux syscall setxattr(2).
+func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return 0, nil, setxattr(t, args, followFinalSymlink)
 }
 
@@ -199,7 +199,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
 		return err
 	}
 
-	return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{
+	return t.Kernel().VFS().SetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetXattrOptions{
 		Name:  name,
 		Value: value,
 		Flags: uint32(flags),
@@ -233,15 +233,15 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, err
 	}
 
-	return 0, nil, file.Setxattr(t, &vfs.SetxattrOptions{
+	return 0, nil, file.SetXattr(t, &vfs.SetXattrOptions{
 		Name:  name,
 		Value: value,
 		Flags: uint32(flags),
 	})
 }
 
-// Removexattr implements Linux syscall removexattr(2).
-func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+// RemoveXattr implements Linux syscall removexattr(2).
+func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return 0, nil, removexattr(t, args, followFinalSymlink)
 }
 
@@ -269,7 +269,7 @@ func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSy
 		return err
 	}
 
-	return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name)
+	return t.Kernel().VFS().RemoveXattrAt(t, t.Credentials(), &tpop.pop, name)
 }
 
 // Fremovexattr implements Linux syscall fremovexattr(2).
@@ -288,7 +288,7 @@ func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 		return 0, nil, err
 	}
 
-	return 0, nil, file.Removexattr(t, name)
+	return 0, nil, file.RemoveXattr(t, name)
 }
 
 func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go
index ab1d140d2..5ed6726ab 100644
--- a/pkg/sentry/usage/memory.go
+++ b/pkg/sentry/usage/memory.go
@@ -278,7 +278,7 @@ func TotalMemory(memSize, used uint64) uint64 {
 	}
 	if memSize < used {
 		memSize = used
-		// Bump totalSize to the next largest power of 2, if one exists, so
+		// Bump memSize to the next largest power of 2, if one exists, so
 		// that MemFree isn't 0.
 		if msb := bits.MostSignificantOne64(memSize); msb < 63 {
 			memSize = uint64(1) << (uint(msb) + 1)
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 642769e7c..440c9307c 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -27,6 +27,39 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "file_description_refs",
+    out = "file_description_refs.go",
+    package = "vfs",
+    prefix = "FileDescription",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "FileDescription",
+    },
+)
+
+go_template_instance(
+    name = "mount_namespace_refs",
+    out = "mount_namespace_refs.go",
+    package = "vfs",
+    prefix = "MountNamespace",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "MountNamespace",
+    },
+)
+
+go_template_instance(
+    name = "filesystem_refs",
+    out = "filesystem_refs.go",
+    package = "vfs",
+    prefix = "Filesystem",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "Filesystem",
+    },
+)
+
 go_library(
     name = "vfs",
     srcs = [
@@ -40,29 +73,34 @@ go_library(
         "event_list.go",
         "file_description.go",
         "file_description_impl_util.go",
+        "file_description_refs.go",
         "filesystem.go",
         "filesystem_impl_util.go",
+        "filesystem_refs.go",
         "filesystem_type.go",
         "inotify.go",
         "lock.go",
         "mount.go",
+        "mount_namespace_refs.go",
         "mount_unsafe.go",
         "options.go",
         "pathname.go",
         "permissions.go",
         "resolving_path.go",
+        "save_restore.go",
         "vfs.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/context",
         "//pkg/fd",
         "//pkg/fdnotifier",
         "//pkg/fspath",
         "//pkg/gohacks",
         "//pkg/log",
+        "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
index 4b9faf2ea..5aad31b78 100644
--- a/pkg/sentry/vfs/README.md
+++ b/pkg/sentry/vfs/README.md
@@ -184,12 +184,3 @@ This construction, which is essentially a type-safe analogue to Linux's
     -   File locking
 
     -   `O_ASYNC`
-
--   Reference counts in the `vfs` package do not use the `refs` package since
-    `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference
-    count, resulting in considerable cache bloat. 24 bytes of this overhead is
-    for weak reference support, which have poor performance and will not be used
-    by VFS2. The remaining 40 bytes is to store a descriptive string and stack
-    trace for reference leak checking; we can support reference leak checking
-    without incurring this space overhead by including the applicable
-    information directly in finalizers for applicable types.
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index 5a0e3e6b5..7ad0eaf86 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -52,6 +52,8 @@ const (
 )
 
 // anonFilesystemType implements FilesystemType.
+//
+// +stateify savable
 type anonFilesystemType struct{}
 
 // GetFilesystem implements FilesystemType.GetFilesystem.
@@ -59,22 +61,28 @@ func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *au
 	panic("cannot instaniate an anon filesystem")
 }
 
-// Name implemenents FilesystemType.Name.
+// Name implements FilesystemType.Name.
 func (anonFilesystemType) Name() string {
 	return "none"
 }
 
+// Release implemenents FilesystemType.Release.
+func (anonFilesystemType) Release(ctx context.Context) {}
+
 // anonFilesystem is the implementation of FilesystemImpl that backs
 // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
 //
 // Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl
 // methods that would require an anonDentry to be a directory return ENOTDIR.
+//
+// +stateify savable
 type anonFilesystem struct {
 	vfsfs Filesystem
 
 	devMinor uint32
 }
 
+// +stateify savable
 type anonDentry struct {
 	vfsd Dentry
 
@@ -245,32 +253,32 @@ func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements FilesystemImpl.ListxattrAt.
-func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements FilesystemImpl.ListXattrAt.
+func (fs *anonFilesystem) ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
 	if !rp.Done() {
 		return nil, syserror.ENOTDIR
 	}
 	return nil, nil
 }
 
-// GetxattrAt implements FilesystemImpl.GetxattrAt.
-func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) {
+// GetXattrAt implements FilesystemImpl.GetXattrAt.
+func (fs *anonFilesystem) GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) {
 	if !rp.Done() {
 		return "", syserror.ENOTDIR
 	}
 	return "", syserror.ENOTSUP
 }
 
-// SetxattrAt implements FilesystemImpl.SetxattrAt.
-func (fs *anonFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
+// SetXattrAt implements FilesystemImpl.SetXattrAt.
+func (fs *anonFilesystem) SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error {
 	if !rp.Done() {
 		return syserror.ENOTDIR
 	}
 	return syserror.EPERM
 }
 
-// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
-func (fs *anonFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
+// RemoveXattrAt implements FilesystemImpl.RemoveXattrAt.
+func (fs *anonFilesystem) RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
 	if !rp.Done() {
 		return syserror.ENOTDIR
 	}
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index c9e724fef..97018651f 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -40,6 +40,30 @@ func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
 	return nil
 }
 
+type mountNamespaceContext struct {
+	context.Context
+	mntns *MountNamespace
+}
+
+// Value implements Context.Value.
+func (mc mountNamespaceContext) Value(key interface{}) interface{} {
+	switch key {
+	case CtxMountNamespace:
+		mc.mntns.IncRef()
+		return mc.mntns
+	default:
+		return mc.Context.Value(key)
+	}
+}
+
+// WithMountNamespace returns a copy of ctx with the given MountNamespace.
+func WithMountNamespace(ctx context.Context, mntns *MountNamespace) context.Context {
+	return &mountNamespaceContext{
+		Context: ctx,
+		mntns:   mntns,
+	}
+}
+
 // RootFromContext returns the VFS root used by ctx. It takes a reference on
 // the returned VirtualDentry. If ctx does not have a specific VFS root,
 // RootFromContext returns a zero-value VirtualDentry.
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index bc7ea93ea..320ab7ce1 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -89,6 +89,8 @@ func (d *Dentry) Impl() DentryImpl {
 // DentryImpl contains implementation details for a Dentry. Implementations of
 // DentryImpl should contain their associated Dentry by value as their first
 // field.
+//
+// +stateify savable
 type DentryImpl interface {
 	// IncRef increments the Dentry's reference count. A Dentry with a non-zero
 	// reference count must remain coherent with the state of the filesystem.
@@ -242,8 +244,9 @@ func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) {
 // caller must call AbortRenameDentry, CommitRenameReplaceDentry, or
 // CommitRenameExchangeDentry depending on the rename's outcome.
 //
-// Preconditions: If to is not nil, it must be a child Dentry from the same
-// Filesystem. from != to.
+// Preconditions:
+// * If to is not nil, it must be a child Dentry from the same Filesystem.
+// * from != to.
 func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
 	vfs.mountMu.Lock()
 	if mntns.mountpoints[from] != 0 {
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
index 1e9dffc8f..dde2ad79b 100644
--- a/pkg/sentry/vfs/device.go
+++ b/pkg/sentry/vfs/device.go
@@ -22,6 +22,8 @@ import (
 )
 
 // DeviceKind indicates whether a device is a block or character device.
+//
+// +stateify savable
 type DeviceKind uint32
 
 const (
@@ -44,6 +46,7 @@ func (kind DeviceKind) String() string {
 	}
 }
 
+// +stateify savable
 type devTuple struct {
 	kind  DeviceKind
 	major uint32
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 1b5af9f73..a98aac52b 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -27,6 +27,8 @@ import (
 var epollCycleMu sync.Mutex
 
 // EpollInstance represents an epoll instance, as described by epoll(7).
+//
+// +stateify savable
 type EpollInstance struct {
 	vfsfd FileDescription
 	FileDescriptionDefaultImpl
@@ -38,11 +40,11 @@ type EpollInstance struct {
 
 	// interest is the set of file descriptors that are registered with the
 	// EpollInstance for monitoring. interest is protected by interestMu.
-	interestMu sync.Mutex
+	interestMu sync.Mutex `state:"nosave"`
 	interest   map[epollInterestKey]*epollInterest
 
 	// mu protects fields in registered epollInterests.
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
 	// ready is the set of file descriptors that may be "ready" for I/O. Note
 	// that this must be an ordered list, not a map: "If more than maxevents
@@ -55,6 +57,7 @@ type EpollInstance struct {
 	ready epollInterestList
 }
 
+// +stateify savable
 type epollInterestKey struct {
 	// file is the registered FileDescription. No reference is held on file;
 	// instead, when the last reference is dropped, FileDescription.DecRef()
@@ -67,9 +70,11 @@ type epollInterestKey struct {
 }
 
 // epollInterest represents an EpollInstance's interest in a file descriptor.
+//
+// +stateify savable
 type epollInterest struct {
 	// epoll is the owning EpollInstance. epoll is immutable.
-	epoll *EpollInstance
+	epoll *EpollInstance `state:"wait"`
 
 	// key is the file to which this epollInterest applies. key is immutable.
 	key epollInterestKey
@@ -331,11 +336,9 @@ func (ep *EpollInstance) removeLocked(epi *epollInterest) {
 	ep.mu.Unlock()
 }
 
-// ReadEvents reads up to len(events) ready events into events and returns the
-// number of events read.
-//
-// Preconditions: len(events) != 0.
-func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
+// ReadEvents appends up to maxReady events to events and returns the updated
+// slice of events.
+func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent, maxEvents int) []linux.EpollEvent {
 	i := 0
 	// Hot path: avoid defer.
 	ep.mu.Lock()
@@ -368,16 +371,16 @@ func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
 			requeue.PushBack(epi)
 		}
 		// Report ievents.
-		events[i] = linux.EpollEvent{
+		events = append(events, linux.EpollEvent{
 			Events: ievents.ToLinux(),
 			Data:   epi.userData,
-		}
+		})
 		i++
-		if i == len(events) {
+		if i == maxEvents {
 			break
 		}
 	}
 	ep.ready.PushBackList(&requeue)
 	ep.mu.Unlock()
-	return i
+	return events
 }
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index dcafffe57..546e445aa 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -37,13 +37,13 @@ import (
 // FileDescription methods require that a reference is held.
 //
 // FileDescription is analogous to Linux's struct file.
+//
+// +stateify savable
 type FileDescription struct {
-	// refs is the reference count. refs is accessed using atomic memory
-	// operations.
-	refs int64
+	FileDescriptionRefs
 
 	// flagsMu protects statusFlags and asyncHandler below.
-	flagsMu sync.Mutex
+	flagsMu sync.Mutex `state:"nosave"`
 
 	// statusFlags contains status flags, "initialized by open(2) and possibly
 	// modified by fcntl()" - fcntl(2). statusFlags can be read using atomic
@@ -58,7 +58,7 @@ type FileDescription struct {
 
 	// epolls is the set of epollInterests registered for this FileDescription.
 	// epolls is protected by epollMu.
-	epollMu sync.Mutex
+	epollMu sync.Mutex `state:"nosave"`
 	epolls  map[*epollInterest]struct{}
 
 	// vd is the filesystem location at which this FileDescription was opened.
@@ -90,6 +90,8 @@ type FileDescription struct {
 }
 
 // FileDescriptionOptions contains options to FileDescription.Init().
+//
+// +stateify savable
 type FileDescriptionOptions struct {
 	// If AllowDirectIO is true, allow O_DIRECT to be set on the file.
 	AllowDirectIO bool
@@ -103,7 +105,7 @@ type FileDescriptionOptions struct {
 
 	// If UseDentryMetadata is true, calls to FileDescription methods that
 	// interact with file and filesystem metadata (Stat, SetStat, StatFS,
-	// Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
+	// ListXattr, GetXattr, SetXattr, RemoveXattr) are implemented by calling
 	// the corresponding FilesystemImpl methods instead of the corresponding
 	// FileDescriptionImpl methods.
 	//
@@ -131,7 +133,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou
 		}
 	}
 
-	fd.refs = 1
+	fd.EnableLeakCheck()
 
 	// Remove "file creation flags" to mirror the behavior from file.f_flags in
 	// fs/open.c:do_dentry_open.
@@ -149,30 +151,9 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou
 	return nil
 }
 
-// IncRef increments fd's reference count.
-func (fd *FileDescription) IncRef() {
-	atomic.AddInt64(&fd.refs, 1)
-}
-
-// TryIncRef increments fd's reference count and returns true. If fd's
-// reference count is already zero, TryIncRef does nothing and returns false.
-//
-// TryIncRef does not require that a reference is held on fd.
-func (fd *FileDescription) TryIncRef() bool {
-	for {
-		refs := atomic.LoadInt64(&fd.refs)
-		if refs <= 0 {
-			return false
-		}
-		if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) {
-			return true
-		}
-	}
-}
-
 // DecRef decrements fd's reference count.
 func (fd *FileDescription) DecRef(ctx context.Context) {
-	if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
+	fd.FileDescriptionRefs.DecRef(func() {
 		// Unregister fd from all epoll instances.
 		fd.epollMu.Lock()
 		epolls := fd.epolls
@@ -202,21 +183,12 @@ func (fd *FileDescription) DecRef(ctx context.Context) {
 		}
 		fd.vd.DecRef(ctx)
 		fd.flagsMu.Lock()
-		// TODO(gvisor.dev/issue/1663): We may need to unregister during save, as we do in VFS1.
 		if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil {
 			fd.asyncHandler.Unregister(fd)
 		}
 		fd.asyncHandler = nil
 		fd.flagsMu.Unlock()
-	} else if refs < 0 {
-		panic("FileDescription.DecRef() called without holding a reference")
-	}
-}
-
-// Refs returns the current number of references. The returned count
-// is inherently racy and is unsafe to use without external synchronization.
-func (fd *FileDescription) Refs() int64 {
-	return atomic.LoadInt64(&fd.refs)
+	})
 }
 
 // Mount returns the mount on which fd was opened. It does not take a reference
@@ -357,6 +329,9 @@ type FileDescriptionImpl interface {
 	// Allocate grows the file to offset + length bytes.
 	// Only mode == 0 is supported currently.
 	//
+	// Allocate should return EISDIR on directories, ESPIPE on pipes, and ENODEV on
+	// other files where it is not supported.
+	//
 	// Preconditions: The FileDescription was opened for writing.
 	Allocate(ctx context.Context, mode, offset, length uint64) error
 
@@ -371,8 +346,9 @@ type FileDescriptionImpl interface {
 	//
 	// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
 	//
-	// Preconditions: The FileDescription was opened for reading.
-	// FileDescriptionOptions.DenyPRead == false.
+	// Preconditions:
+	// * The FileDescription was opened for reading.
+	// * FileDescriptionOptions.DenyPRead == false.
 	PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
 
 	// Read is similar to PRead, but does not specify an offset.
@@ -403,8 +379,9 @@ type FileDescriptionImpl interface {
 	// - If opts.Flags specifies unsupported options, PWrite returns
 	// EOPNOTSUPP.
 	//
-	// Preconditions: The FileDescription was opened for writing.
-	// FileDescriptionOptions.DenyPWrite == false.
+	// Preconditions:
+	// * The FileDescription was opened for writing.
+	// * FileDescriptionOptions.DenyPWrite == false.
 	PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
 
 	// Write is similar to PWrite, but does not specify an offset, which is
@@ -449,19 +426,19 @@ type FileDescriptionImpl interface {
 	// Ioctl implements the ioctl(2) syscall.
 	Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
 
-	// Listxattr returns all extended attribute names for the file.
-	Listxattr(ctx context.Context, size uint64) ([]string, error)
+	// ListXattr returns all extended attribute names for the file.
+	ListXattr(ctx context.Context, size uint64) ([]string, error)
 
-	// Getxattr returns the value associated with the given extended attribute
+	// GetXattr returns the value associated with the given extended attribute
 	// for the file.
-	Getxattr(ctx context.Context, opts GetxattrOptions) (string, error)
+	GetXattr(ctx context.Context, opts GetXattrOptions) (string, error)
 
-	// Setxattr changes the value associated with the given extended attribute
+	// SetXattr changes the value associated with the given extended attribute
 	// for the file.
-	Setxattr(ctx context.Context, opts SetxattrOptions) error
+	SetXattr(ctx context.Context, opts SetXattrOptions) error
 
-	// Removexattr removes the given extended attribute from the file.
-	Removexattr(ctx context.Context, name string) error
+	// RemoveXattr removes the given extended attribute from the file.
+	RemoveXattr(ctx context.Context, name string) error
 
 	// LockBSD tries to acquire a BSD-style advisory file lock.
 	LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
@@ -477,6 +454,8 @@ type FileDescriptionImpl interface {
 }
 
 // Dirent holds the information contained in struct linux_dirent64.
+//
+// +stateify savable
 type Dirent struct {
 	// Name is the filename.
 	Name string
@@ -664,25 +643,25 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
 	return fd.impl.Ioctl(ctx, uio, args)
 }
 
-// Listxattr returns all extended attribute names for the file represented by
+// ListXattr returns all extended attribute names for the file represented by
 // fd.
 //
 // If the size of the list (including a NUL terminating byte after every entry)
 // would exceed size, ERANGE may be returned. Note that implementations
 // are free to ignore size entirely and return without error). In all cases,
 // if size is 0, the list should be returned without error, regardless of size.
-func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+func (fd *FileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
+		names, err := fd.vd.mount.fs.impl.ListXattrAt(ctx, rp, size)
 		vfsObj.putResolvingPath(ctx, rp)
 		return names, err
 	}
-	names, err := fd.impl.Listxattr(ctx, size)
+	names, err := fd.impl.ListXattr(ctx, size)
 	if err == syserror.ENOTSUP {
 		// Linux doesn't actually return ENOTSUP in this case; instead,
 		// fs/xattr.c:vfs_listxattr() falls back to allowing the security
@@ -693,57 +672,57 @@ func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string
 	return names, err
 }
 
-// Getxattr returns the value associated with the given extended attribute for
+// GetXattr returns the value associated with the given extended attribute for
 // the file represented by fd.
 //
 // If the size of the return value exceeds opts.Size, ERANGE may be returned
 // (note that implementations are free to ignore opts.Size entirely and return
 // without error). In all cases, if opts.Size is 0, the value should be
 // returned without error, regardless of size.
-func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) {
+func (fd *FileDescription) GetXattr(ctx context.Context, opts *GetXattrOptions) (string, error) {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
+		val, err := fd.vd.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
 		vfsObj.putResolvingPath(ctx, rp)
 		return val, err
 	}
-	return fd.impl.Getxattr(ctx, *opts)
+	return fd.impl.GetXattr(ctx, *opts)
 }
 
-// Setxattr changes the value associated with the given extended attribute for
+// SetXattr changes the value associated with the given extended attribute for
 // the file represented by fd.
-func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error {
+func (fd *FileDescription) SetXattr(ctx context.Context, opts *SetXattrOptions) error {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+		err := fd.vd.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
 		vfsObj.putResolvingPath(ctx, rp)
 		return err
 	}
-	return fd.impl.Setxattr(ctx, *opts)
+	return fd.impl.SetXattr(ctx, *opts)
 }
 
-// Removexattr removes the given extended attribute from the file represented
+// RemoveXattr removes the given extended attribute from the file represented
 // by fd.
-func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
+func (fd *FileDescription) RemoveXattr(ctx context.Context, name string) error {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+		err := fd.vd.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
 		vfsObj.putResolvingPath(ctx, rp)
 		return err
 	}
-	return fd.impl.Removexattr(ctx, name)
+	return fd.impl.RemoveXattr(ctx, name)
 }
 
 // SyncFS instructs the filesystem containing fd to execute the semantics of
@@ -845,3 +824,45 @@ func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsyn
 	}
 	return fd.asyncHandler
 }
+
+// FileReadWriteSeeker is a helper struct to pass a FileDescription as
+// io.Reader/io.Writer/io.ReadSeeker/io.ReaderAt/io.WriterAt/etc.
+type FileReadWriteSeeker struct {
+	FD    *FileDescription
+	Ctx   context.Context
+	ROpts ReadOptions
+	WOpts WriteOptions
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (f *FileReadWriteSeeker) ReadAt(p []byte, off int64) (int, error) {
+	dst := usermem.BytesIOSequence(p)
+	n, err := f.FD.PRead(f.Ctx, dst, off, f.ROpts)
+	return int(n), err
+}
+
+// Read implements io.ReadWriteSeeker.Read.
+func (f *FileReadWriteSeeker) Read(p []byte) (int, error) {
+	dst := usermem.BytesIOSequence(p)
+	n, err := f.FD.Read(f.Ctx, dst, f.ROpts)
+	return int(n), err
+}
+
+// Seek implements io.ReadWriteSeeker.Seek.
+func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) {
+	return f.FD.Seek(f.Ctx, offset, int32(whence))
+}
+
+// WriteAt implements io.WriterAt.WriteAt.
+func (f *FileReadWriteSeeker) WriteAt(p []byte, off int64) (int, error) {
+	dst := usermem.BytesIOSequence(p)
+	n, err := f.FD.PWrite(f.Ctx, dst, off, f.WOpts)
+	return int(n), err
+}
+
+// Write implements io.ReadWriteSeeker.Write.
+func (f *FileReadWriteSeeker) Write(p []byte) (int, error) {
+	buf := usermem.BytesIOSequence(p)
+	n, err := f.FD.Write(f.Ctx, buf, f.WOpts)
+	return int(n), err
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 6b8b4ad49..48ca9de44 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -42,6 +42,8 @@ import (
 // FileDescriptionDefaultImpl may be embedded by implementations of
 // FileDescriptionImpl to obtain implementations of many FileDescriptionImpl
 // methods with default behavior analogous to Linux's.
+//
+// +stateify savable
 type FileDescriptionDefaultImpl struct{}
 
 // OnClose implements FileDescriptionImpl.OnClose analogously to
@@ -57,7 +59,11 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err
 }
 
 // Allocate implements FileDescriptionImpl.Allocate analogously to
-// fallocate called on regular file, directory or FIFO in Linux.
+// fallocate called on an invalid type of file in Linux.
+//
+// Note that directories can rely on this implementation even though they
+// should technically return EISDIR. Allocate should never be called for a
+// directory, because it requires a writable fd.
 func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error {
 	return syserror.ENODEV
 }
@@ -134,34 +140,36 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
 	return 0, syserror.ENOTTY
 }
 
-// Listxattr implements FileDescriptionImpl.Listxattr analogously to
+// ListXattr implements FileDescriptionImpl.ListXattr analogously to
 // inode_operations::listxattr == NULL in Linux.
-func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) {
-	// This isn't exactly accurate; see FileDescription.Listxattr.
+func (FileDescriptionDefaultImpl) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+	// This isn't exactly accurate; see FileDescription.ListXattr.
 	return nil, syserror.ENOTSUP
 }
 
-// Getxattr implements FileDescriptionImpl.Getxattr analogously to
+// GetXattr implements FileDescriptionImpl.GetXattr analogously to
 // inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) {
+func (FileDescriptionDefaultImpl) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) {
 	return "", syserror.ENOTSUP
 }
 
-// Setxattr implements FileDescriptionImpl.Setxattr analogously to
+// SetXattr implements FileDescriptionImpl.SetXattr analogously to
 // inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+func (FileDescriptionDefaultImpl) SetXattr(ctx context.Context, opts SetXattrOptions) error {
 	return syserror.ENOTSUP
 }
 
-// Removexattr implements FileDescriptionImpl.Removexattr analogously to
+// RemoveXattr implements FileDescriptionImpl.RemoveXattr analogously to
 // inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error {
+func (FileDescriptionDefaultImpl) RemoveXattr(ctx context.Context, name string) error {
 	return syserror.ENOTSUP
 }
 
 // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
 // FileDescriptionImpl that always represent directories to obtain
 // implementations of non-directory I/O methods that return EISDIR.
+//
+// +stateify savable
 type DirectoryFileDescriptionDefaultImpl struct{}
 
 // Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate.
@@ -192,6 +200,8 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme
 // DentryMetadataFileDescriptionImpl may be embedded by implementations of
 // FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is
 // true to obtain implementations of Stat and SetStat that panic.
+//
+// +stateify savable
 type DentryMetadataFileDescriptionImpl struct{}
 
 // Stat implements FileDescriptionImpl.Stat.
@@ -206,12 +216,16 @@ func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetSt
 
 // DynamicBytesSource represents a data source for a
 // DynamicBytesFileDescriptionImpl.
+//
+// +stateify savable
 type DynamicBytesSource interface {
 	// Generate writes the file's contents to buf.
 	Generate(ctx context.Context, buf *bytes.Buffer) error
 }
 
 // StaticData implements DynamicBytesSource over a static string.
+//
+// +stateify savable
 type StaticData struct {
 	Data string
 }
@@ -238,14 +252,24 @@ type WritableDynamicBytesSource interface {
 //
 // DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first
 // use.
+//
+// +stateify savable
 type DynamicBytesFileDescriptionImpl struct {
 	data     DynamicBytesSource // immutable
-	mu       sync.Mutex         // protects the following fields
-	buf      bytes.Buffer
+	mu       sync.Mutex         `state:"nosave"` // protects the following fields
+	buf      bytes.Buffer       `state:".([]byte)"`
 	off      int64
 	lastRead int64 // offset at which the last Read, PRead, or Seek ended
 }
 
+func (fd *DynamicBytesFileDescriptionImpl) saveBuf() []byte {
+	return fd.buf.Bytes()
+}
+
+func (fd *DynamicBytesFileDescriptionImpl) loadBuf(p []byte) {
+	fd.buf.Write(p)
+}
+
 // SetDataSource must be called exactly once on fd before first use.
 func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
 	fd.data = data
@@ -378,6 +402,8 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M
 
 // LockFD may be used by most implementations of FileDescriptionImpl.Lock*
 // functions. Caller must call Init().
+//
+// +stateify savable
 type LockFD struct {
 	locks *FileLocks
 }
@@ -405,6 +431,8 @@ func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
 
 // NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface
 // returning ENOLCK.
+//
+// +stateify savable
 type NoLockFD struct{}
 
 // LockBSD implements vfs.FileDescriptionImpl.LockBSD.
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index df3758fd1..c93d94634 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -15,8 +15,6 @@
 package vfs
 
 import (
-	"sync/atomic"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
@@ -34,9 +32,7 @@ import (
 //
 // +stateify savable
 type Filesystem struct {
-	// refs is the reference count. refs is accessed using atomic memory
-	// operations.
-	refs int64
+	FilesystemRefs
 
 	// vfs is the VirtualFilesystem that uses this Filesystem. vfs is
 	// immutable.
@@ -52,7 +48,7 @@ type Filesystem struct {
 
 // Init must be called before first use of fs.
 func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) {
-	fs.refs = 1
+	fs.EnableLeakCheck()
 	fs.vfs = vfsObj
 	fs.fsType = fsType
 	fs.impl = impl
@@ -76,39 +72,14 @@ func (fs *Filesystem) Impl() FilesystemImpl {
 	return fs.impl
 }
 
-// IncRef increments fs' reference count.
-func (fs *Filesystem) IncRef() {
-	if atomic.AddInt64(&fs.refs, 1) <= 1 {
-		panic("Filesystem.IncRef() called without holding a reference")
-	}
-}
-
-// TryIncRef increments fs' reference count and returns true. If fs' reference
-// count is zero, TryIncRef does nothing and returns false.
-//
-// TryIncRef does not require that a reference is held on fs.
-func (fs *Filesystem) TryIncRef() bool {
-	for {
-		refs := atomic.LoadInt64(&fs.refs)
-		if refs <= 0 {
-			return false
-		}
-		if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) {
-			return true
-		}
-	}
-}
-
 // DecRef decrements fs' reference count.
 func (fs *Filesystem) DecRef(ctx context.Context) {
-	if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
+	fs.FilesystemRefs.DecRef(func() {
 		fs.vfs.filesystemsMu.Lock()
 		delete(fs.vfs.filesystems, fs)
 		fs.vfs.filesystemsMu.Unlock()
 		fs.impl.Release(ctx)
-	} else if refs < 0 {
-		panic("Filesystem.decRef() called without holding a reference")
-	}
+	})
 }
 
 // FilesystemImpl contains implementation details for a Filesystem.
@@ -212,8 +183,9 @@ type FilesystemImpl interface {
 	// ENOENT. Equivalently, if vd represents a file with a link count of 0 not
 	// created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink().
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
 	//
 	// Postconditions: If LinkAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
@@ -231,8 +203,9 @@ type FilesystemImpl interface {
 	// - If the directory in which the new directory would be created has been
 	// removed by RmdirAt or RenameAt, MkdirAt returns ENOENT.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink().
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
 	//
 	// Postconditions: If MkdirAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
@@ -253,8 +226,9 @@ type FilesystemImpl interface {
 	// - If the directory in which the file would be created has been removed
 	// by RmdirAt or RenameAt, MknodAt returns ENOENT.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink().
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
 	//
 	// Postconditions: If MknodAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
@@ -345,11 +319,12 @@ type FilesystemImpl interface {
 	// - If renaming would replace a non-empty directory, RenameAt returns
 	// ENOTEMPTY.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a
-	// previous call to
-	// oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is
-	// not "." or "..".
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
+	// * oldParentVD.Dentry() was obtained from a previous call to
+	//   oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt().
+	// * oldName is not "." or "..".
 	//
 	// Postconditions: If RenameAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
@@ -372,8 +347,9 @@ type FilesystemImpl interface {
 	// - If the file at rp exists but is not a directory, RmdirAt returns
 	// ENOTDIR.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink().
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
 	//
 	// Postconditions: If RmdirAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
@@ -410,8 +386,9 @@ type FilesystemImpl interface {
 	// - If the directory in which the symbolic link would be created has been
 	// removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink().
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
 	//
 	// Postconditions: If SymlinkAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
@@ -431,33 +408,34 @@ type FilesystemImpl interface {
 	//
 	// - If the file at rp exists but is a directory, UnlinkAt returns EISDIR.
 	//
-	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink().
+	// Preconditions:
+	// * !rp.Done().
+	// * For the final path component in rp, !rp.ShouldFollowSymlink().
 	//
 	// Postconditions: If UnlinkAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
 	UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 
-	// ListxattrAt returns all extended attribute names for the file at rp.
+	// ListXattrAt returns all extended attribute names for the file at rp.
 	//
 	// Errors:
 	//
 	// - If extended attributes are not supported by the filesystem,
-	// ListxattrAt returns ENOTSUP.
+	// ListXattrAt returns ENOTSUP.
 	//
 	// - If the size of the list (including a NUL terminating byte after every
 	// entry) would exceed size, ERANGE may be returned. Note that
 	// implementations are free to ignore size entirely and return without
 	// error). In all cases, if size is 0, the list should be returned without
 	// error, regardless of size.
-	ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
+	ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
 
-	// GetxattrAt returns the value associated with the given extended
+	// GetXattrAt returns the value associated with the given extended
 	// attribute for the file at rp.
 	//
 	// Errors:
 	//
-	// - If extended attributes are not supported by the filesystem, GetxattrAt
+	// - If extended attributes are not supported by the filesystem, GetXattrAt
 	// returns ENOTSUP.
 	//
 	// - If an extended attribute named opts.Name does not exist, ENODATA is
@@ -467,30 +445,30 @@ type FilesystemImpl interface {
 	// returned (note that implementations are free to ignore opts.Size entirely
 	// and return without error). In all cases, if opts.Size is 0, the value
 	// should be returned without error, regardless of size.
-	GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error)
+	GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error)
 
-	// SetxattrAt changes the value associated with the given extended
+	// SetXattrAt changes the value associated with the given extended
 	// attribute for the file at rp.
 	//
 	// Errors:
 	//
-	// - If extended attributes are not supported by the filesystem, SetxattrAt
+	// - If extended attributes are not supported by the filesystem, SetXattrAt
 	// returns ENOTSUP.
 	//
 	// - If XATTR_CREATE is set in opts.Flag and opts.Name already exists,
 	// EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist,
 	// ENODATA is returned.
-	SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
+	SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error
 
-	// RemovexattrAt removes the given extended attribute from the file at rp.
+	// RemoveXattrAt removes the given extended attribute from the file at rp.
 	//
 	// Errors:
 	//
 	// - If extended attributes are not supported by the filesystem,
-	// RemovexattrAt returns ENOTSUP.
+	// RemoveXattrAt returns ENOTSUP.
 	//
 	// - If name does not exist, ENODATA is returned.
-	RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
+	RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error
 
 	// BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
 	//
@@ -528,6 +506,8 @@ type FilesystemImpl interface {
 
 // PrependPathAtVFSRootError is returned by implementations of
 // FilesystemImpl.PrependPath() when they encounter the contextual VFS root.
+//
+// +stateify savable
 type PrependPathAtVFSRootError struct{}
 
 // Error implements error.Error.
@@ -538,6 +518,8 @@ func (PrependPathAtVFSRootError) Error() string {
 // PrependPathAtNonMountRootError is returned by implementations of
 // FilesystemImpl.PrependPath() when they encounter an independent ancestor
 // Dentry that is not the Mount root.
+//
+// +stateify savable
 type PrependPathAtNonMountRootError struct{}
 
 // Error implements error.Error.
@@ -548,6 +530,8 @@ func (PrependPathAtNonMountRootError) Error() string {
 // PrependPathSyntheticError is returned by implementations of
 // FilesystemImpl.PrependPath() for which prepended names do not represent real
 // paths.
+//
+// +stateify savable
 type PrependPathSyntheticError struct{}
 
 // Error implements error.Error.
diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go
index 465e610e0..2620cf975 100644
--- a/pkg/sentry/vfs/filesystem_impl_util.go
+++ b/pkg/sentry/vfs/filesystem_impl_util.go
@@ -16,6 +16,9 @@ package vfs
 
 import (
 	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // GenericParseMountOptions parses a comma-separated list of options of the
@@ -41,3 +44,13 @@ func GenericParseMountOptions(str string) map[string]string {
 	}
 	return m
 }
+
+// GenericStatFS returns a statfs struct filled with the common fields for a
+// general filesystem. This is analogous to Linux's fs/libfs.cs:simple_statfs().
+func GenericStatFS(fsMagic uint64) linux.Statfs {
+	return linux.Statfs{
+		Type:       fsMagic,
+		BlockSize:  usermem.PageSize,
+		NameLength: linux.NAME_MAX,
+	}
+}
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index f2298f7f6..9d54cc4ed 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -33,6 +33,9 @@ type FilesystemType interface {
 
 	// Name returns the name of this FilesystemType.
 	Name() string
+
+	// Release releases all resources held by this FilesystemType.
+	Release(ctx context.Context)
 }
 
 // GetFilesystemOptions contains options to FilesystemType.GetFilesystem.
@@ -55,10 +58,13 @@ type registeredFilesystemType struct {
 
 // RegisterFilesystemTypeOptions contains options to
 // VirtualFilesystem.RegisterFilesystem().
+//
+// +stateify savable
 type RegisterFilesystemTypeOptions struct {
-	// If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt()
-	// for which MountOptions.InternalMount == false to use this filesystem
-	// type.
+	// AllowUserMount determines whether users are allowed to mount a file system
+	// of this type, i.e. through mount(2). If AllowUserMount is true, allow calls
+	// to VirtualFilesystem.MountAt() for which MountOptions.InternalMount == false
+	// to use this filesystem type.
 	AllowUserMount bool
 
 	// If AllowUserList is true, make this filesystem type visible in
diff --git a/pkg/sentry/vfs/g3doc/inotify.md b/pkg/sentry/vfs/g3doc/inotify.md
index e7da49faa..833db213f 100644
--- a/pkg/sentry/vfs/g3doc/inotify.md
+++ b/pkg/sentry/vfs/g3doc/inotify.md
@@ -28,9 +28,9 @@ The set of all watches held on a single file (i.e., the watch target) is stored
 in vfs.Watches. Each watch will belong to a different inotify instance (an
 instance can only have one watch on any watch target). The watches are stored in
 a map indexed by their vfs.Inotify owner’s id. Hard links and file descriptions
-to a single file will all share the same vfs.Watches. Activity on the target
-causes its vfs.Watches to generate notifications on its watches’ inotify
-instances.
+to a single file will all share the same vfs.Watches (with the exception of the
+gofer filesystem, described in a later section). Activity on the target causes
+its vfs.Watches to generate notifications on its watches’ inotify instances.
 
 ### vfs.Watch
 
@@ -103,12 +103,12 @@ inotify:
     unopened p9 file (and possibly an open FID), through which the Sentry
     interacts with the gofer.
     *   *Solution:* Because there is no inode structure stored in the sandbox,
-        inotify watches must be held on the dentry. This would be an issue in
-        the presence of hard links, where multiple dentries would need to share
-        the same set of watches, but in VFS2, we do not support the internal
-        creation of hard links on gofer fs. As a result, we make the assumption
-        that every dentry corresponds to a unique inode. However, the next point
-        raises an issue with this assumption:
+        inotify watches must be held on the dentry. For the purposes of inotify,
+        we assume that every dentry corresponds to a unique inode, which may
+        cause unexpected behavior in the presence of hard links, where multiple
+        dentries should share the same set of watches. Indeed, it is impossible
+        for us to be absolutely sure whether dentries correspond to the same
+        file or not, due to the following point:
 *   **The Sentry cannot always be aware of hard links on the remote
     filesystem.** There is no way for us to confirm whether two files on the
     remote filesystem are actually links to the same inode. QIDs and inodes are
diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go
index 8882fa84a..ba6e6ed49 100644
--- a/pkg/sentry/vfs/genericfstree/genericfstree.go
+++ b/pkg/sentry/vfs/genericfstree/genericfstree.go
@@ -27,6 +27,8 @@ import (
 )
 
 // Dentry is a required type parameter that is a struct with the given fields.
+//
+// +stateify savable
 type Dentry struct {
 	// vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl.
 	vfsd vfs.Dentry
@@ -69,7 +71,7 @@ func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath
 		if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
 			return vfs.PrependPathAtVFSRootError{}
 		}
-		if &d.vfsd == mnt.Root() {
+		if mnt != nil && &d.vfsd == mnt.Root() {
 			return nil
 		}
 		if d.parent == nil {
@@ -79,3 +81,12 @@ func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath
 		d = d.parent
 	}
 }
+
+// DebugPathname returns a pathname to d relative to its filesystem root.
+// DebugPathname does not correspond to any Linux function; it's used to
+// generate dentry pathnames for debugging.
+func DebugPathname(d *Dentry) string {
+	var b fspath.Builder
+	_ = PrependPath(vfs.VirtualDentry{}, nil, d, &b)
+	return b.String()
+}
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
index aff220a61..3f0b8f45b 100644
--- a/pkg/sentry/vfs/inotify.go
+++ b/pkg/sentry/vfs/inotify.go
@@ -37,6 +37,8 @@ const inotifyEventBaseSize = 16
 //
 // The way events are labelled appears somewhat arbitrary, but they must match
 // Linux so that IN_EXCL_UNLINK behaves as it does in Linux.
+//
+// +stateify savable
 type EventType uint8
 
 // PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and
diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go
index 42666eebf..1ff202f2a 100644
--- a/pkg/sentry/vfs/lock.go
+++ b/pkg/sentry/vfs/lock.go
@@ -12,11 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package lock provides POSIX and BSD style file locking for VFS2 file
-// implementations.
-//
-// The actual implementations can be found in the lock package under
-// sentry/fs/lock.
 package vfs
 
 import (
@@ -33,6 +28,8 @@ import (
 // Note that in Linux these two types of locks are _not_ cooperative, because
 // race and deadlock conditions make merging them prohibitive. We do the same
 // and keep them oblivious to each other.
+//
+// +stateify savable
 type FileLocks struct {
 	// bsd is a set of BSD-style advisory file wide locks, see flock(2).
 	bsd fslock.Locks
diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go
index cc1e7d764..638b5d830 100644
--- a/pkg/sentry/vfs/memxattr/xattr.go
+++ b/pkg/sentry/vfs/memxattr/xattr.go
@@ -33,8 +33,8 @@ type SimpleExtendedAttributes struct {
 	xattrs map[string]string
 }
 
-// Getxattr returns the value at 'name'.
-func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) {
+// GetXattr returns the value at 'name'.
+func (x *SimpleExtendedAttributes) GetXattr(opts *vfs.GetXattrOptions) (string, error) {
 	x.mu.RLock()
 	value, ok := x.xattrs[opts.Name]
 	x.mu.RUnlock()
@@ -49,8 +49,8 @@ func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string,
 	return value, nil
 }
 
-// Setxattr sets 'value' at 'name'.
-func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
+// SetXattr sets 'value' at 'name'.
+func (x *SimpleExtendedAttributes) SetXattr(opts *vfs.SetXattrOptions) error {
 	x.mu.Lock()
 	defer x.mu.Unlock()
 	if x.xattrs == nil {
@@ -72,8 +72,8 @@ func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
 	return nil
 }
 
-// Listxattr returns all names in xattrs.
-func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
+// ListXattr returns all names in xattrs.
+func (x *SimpleExtendedAttributes) ListXattr(size uint64) ([]string, error) {
 	// Keep track of the size of the buffer needed in listxattr(2) for the list.
 	listSize := 0
 	x.mu.RLock()
@@ -90,8 +90,8 @@ func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
 	return names, nil
 }
 
-// Removexattr removes the xattr at 'name'.
-func (x *SimpleExtendedAttributes) Removexattr(name string) error {
+// RemoveXattr removes the xattr at 'name'.
+func (x *SimpleExtendedAttributes) RemoveXattr(name string) error {
 	x.mu.Lock()
 	defer x.mu.Unlock()
 	if _, ok := x.xattrs[name]; !ok {
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 67dfba986..d452d2cda 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -24,6 +24,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -46,8 +47,9 @@ import (
 // +stateify savable
 type Mount struct {
 	// vfs, fs, root are immutable. References are held on fs and root.
+	// Note that for a disconnected mount, root may be nil.
 	//
-	// Invariant: root belongs to fs.
+	// Invariant: if not nil, root belongs to fs.
 	vfs  *VirtualFilesystem
 	fs   *Filesystem
 	root *Dentry
@@ -65,7 +67,7 @@ type Mount struct {
 	//
 	// Invariant: key.parent != nil iff key.point != nil. key.point belongs to
 	// key.parent.fs.
-	key mountKey
+	key mountKey `state:".(VirtualDentry)"`
 
 	// ns is the namespace in which this Mount was mounted. ns is protected by
 	// VirtualFilesystem.mountMu.
@@ -105,6 +107,9 @@ func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *Mount
 	if opts.ReadOnly {
 		mnt.setReadOnlyLocked(true)
 	}
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Register(mnt, "vfs.Mount")
+	}
 	return mnt
 }
 
@@ -126,16 +131,14 @@ func (mnt *Mount) Options() MountOptions {
 //
 // +stateify savable
 type MountNamespace struct {
+	MountNamespaceRefs
+
 	// Owner is the usernamespace that owns this mount namespace.
 	Owner *auth.UserNamespace
 
 	// root is the MountNamespace's root mount. root is immutable.
 	root *Mount
 
-	// refs is the reference count. refs is accessed using atomic memory
-	// operations.
-	refs int64
-
 	// mountpoints maps all Dentries which are mount points in this namespace
 	// to the number of Mounts for which they are mount points. mountpoints is
 	// protected by VirtualFilesystem.mountMu.
@@ -154,22 +157,22 @@ type MountNamespace struct {
 // NewMountNamespace returns a new mount namespace with a root filesystem
 // configured by the given arguments. A reference is taken on the returned
 // MountNamespace.
-func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
+func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*MountNamespace, error) {
 	rft := vfs.getFilesystemType(fsTypeName)
 	if rft == nil {
 		ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
 		return nil, syserror.ENODEV
 	}
-	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
 	if err != nil {
 		return nil, err
 	}
 	mntns := &MountNamespace{
 		Owner:       creds.UserNamespace,
-		refs:        1,
 		mountpoints: make(map[*Dentry]uint32),
 	}
-	mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{})
+	mntns.EnableLeakCheck()
+	mntns.root = newMount(vfs, fs, root, mntns, opts)
 	return mntns, nil
 }
 
@@ -263,16 +266,20 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr
 }
 
 // MountAt creates and mounts a Filesystem configured by the given arguments.
-func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+// The VirtualFilesystem will hold a reference to the Mount until it is unmounted.
+//
+// This method returns the mounted Mount without a reference, for convenience
+// during VFS setup when there is no chance of racing with unmount.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) {
 	mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	defer mnt.DecRef(ctx)
 	if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil {
-		return err
+		return nil, err
 	}
-	return nil
+	return mnt, nil
 }
 
 // UmountAt removes the Mount at the given path.
@@ -343,6 +350,7 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
 	return nil
 }
 
+// +stateify savable
 type umountRecursiveOptions struct {
 	// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
 	// on umounted mounts fail.
@@ -369,8 +377,9 @@ type umountRecursiveOptions struct {
 //
 // umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree().
 //
-// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section.
+// Preconditions:
+// * vfs.mountMu must be locked.
+// * vfs.mounts.seq must be in a writer critical section.
 func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) {
 	if !mnt.umounted {
 		mnt.umounted = true
@@ -399,9 +408,11 @@ func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecu
 // connectLocked makes vd the mount parent/point for mnt. It consumes
 // references held by vd.
 //
-// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section. d.mu must be locked. mnt.parent() == nil, i.e. mnt
-// must not already be connected.
+// Preconditions:
+// * vfs.mountMu must be locked.
+// * vfs.mounts.seq must be in a writer critical section.
+// * d.mu must be locked.
+// * mnt.parent() == nil, i.e. mnt must not already be connected.
 func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
 	if checkInvariants {
 		if mnt.parent() != nil {
@@ -409,7 +420,7 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
 		}
 	}
 	mnt.IncRef() // dropped by callers of umountRecursiveLocked
-	mnt.storeKey(vd)
+	mnt.setKey(vd)
 	if vd.mount.children == nil {
 		vd.mount.children = make(map[*Mount]struct{})
 	}
@@ -429,16 +440,18 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
 // disconnectLocked makes vd have no mount parent/point and returns its old
 // mount parent/point with a reference held.
 //
-// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section. mnt.parent() != nil.
+// Preconditions:
+// * vfs.mountMu must be locked.
+// * vfs.mounts.seq must be in a writer critical section.
+// * mnt.parent() != nil.
 func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
-	vd := mnt.loadKey()
+	vd := mnt.getKey()
 	if checkInvariants {
 		if vd.mount != nil {
 			panic("VFS.disconnectLocked called on disconnected mount")
 		}
 	}
-	mnt.storeKey(VirtualDentry{})
+	mnt.loadKey(VirtualDentry{})
 	delete(vd.mount.children, mnt)
 	atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
 	mnt.ns.mountpoints[vd.dentry]--
@@ -480,35 +493,42 @@ func (mnt *Mount) IncRef() {
 
 // DecRef decrements mnt's reference count.
 func (mnt *Mount) DecRef(ctx context.Context) {
-	refs := atomic.AddInt64(&mnt.refs, -1)
-	if refs&^math.MinInt64 == 0 { // mask out MSB
-		var vd VirtualDentry
-		if mnt.parent() != nil {
-			mnt.vfs.mountMu.Lock()
-			mnt.vfs.mounts.seq.BeginWrite()
-			vd = mnt.vfs.disconnectLocked(mnt)
-			mnt.vfs.mounts.seq.EndWrite()
-			mnt.vfs.mountMu.Unlock()
-		}
-		mnt.root.DecRef(ctx)
-		mnt.fs.DecRef(ctx)
-		if vd.Ok() {
-			vd.DecRef(ctx)
+	r := atomic.AddInt64(&mnt.refs, -1)
+	if r&^math.MinInt64 == 0 { // mask out MSB
+		if refsvfs2.LeakCheckEnabled() {
+			refsvfs2.Unregister(mnt, "vfs.Mount")
 		}
+		mnt.destroy(ctx)
 	}
 }
 
-// IncRef increments mntns' reference count.
-func (mntns *MountNamespace) IncRef() {
-	if atomic.AddInt64(&mntns.refs, 1) <= 1 {
-		panic("MountNamespace.IncRef() called without holding a reference")
+func (mnt *Mount) destroy(ctx context.Context) {
+	var vd VirtualDentry
+	if mnt.parent() != nil {
+		mnt.vfs.mountMu.Lock()
+		mnt.vfs.mounts.seq.BeginWrite()
+		vd = mnt.vfs.disconnectLocked(mnt)
+		mnt.vfs.mounts.seq.EndWrite()
+		mnt.vfs.mountMu.Unlock()
+	}
+	if mnt.root != nil {
+		mnt.root.DecRef(ctx)
 	}
+	mnt.fs.DecRef(ctx)
+	if vd.Ok() {
+		vd.DecRef(ctx)
+	}
+}
+
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (mnt *Mount) LeakMessage() string {
+	return fmt.Sprintf("[vfs.Mount %p] reference count of %d instead of 0", mnt, atomic.LoadInt64(&mnt.refs))
 }
 
 // DecRef decrements mntns' reference count.
 func (mntns *MountNamespace) DecRef(ctx context.Context) {
 	vfs := mntns.root.fs.VirtualFilesystem()
-	if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 {
+	mntns.MountNamespaceRefs.DecRef(func() {
 		vfs.mountMu.Lock()
 		vfs.mounts.seq.BeginWrite()
 		vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{
@@ -522,9 +542,7 @@ func (mntns *MountNamespace) DecRef(ctx context.Context) {
 		for _, mnt := range mountsToDecRef {
 			mnt.DecRef(ctx)
 		}
-	} else if refs < 0 {
-		panic("MountNamespace.DecRef() called without holding a reference")
-	}
+	})
 }
 
 // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
@@ -576,8 +594,9 @@ retryFirst:
 // mnt. It takes a reference on the returned VirtualDentry. If no such mount
 // point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil).
 //
-// Preconditions: References are held on mnt and root. vfsroot is not (mnt,
-// mnt.root).
+// Preconditions:
+// * References are held on mnt and root.
+// * vfsroot is not (mnt, mnt.root).
 func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
 	// The first mount is special-cased:
 	//
@@ -651,6 +670,13 @@ retryFirst:
 	return VirtualDentry{mnt, d}
 }
 
+// SetMountReadOnly sets the mount as ReadOnly.
+func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error {
+	vfs.mountMu.Lock()
+	defer vfs.mountMu.Unlock()
+	return mnt.setReadOnlyLocked(ro)
+}
+
 // CheckBeginWrite increments the counter of in-progress write operations on
 // mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
 // EROFS.
@@ -717,14 +743,12 @@ func (mnt *Mount) Root() *Dentry {
 	return mnt.root
 }
 
-// Root returns mntns' root. A reference is taken on the returned
-// VirtualDentry.
+// Root returns mntns' root. It does not take a reference on the returned Dentry.
 func (mntns *MountNamespace) Root() VirtualDentry {
 	vd := VirtualDentry{
 		mount:  mntns.root,
 		dentry: mntns.root.root,
 	}
-	vd.IncRef()
 	return vd
 }
 
@@ -732,11 +756,23 @@ func (mntns *MountNamespace) Root() VirtualDentry {
 //
 // Preconditions: taskRootDir.Ok().
 func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
-	vfs.mountMu.Lock()
-	defer vfs.mountMu.Unlock()
 	rootMnt := taskRootDir.mount
+
+	vfs.mountMu.Lock()
 	mounts := rootMnt.submountsLocked()
+	// Take a reference on mounts since we need to drop vfs.mountMu before
+	// calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()).
+	for _, mnt := range mounts {
+		mnt.IncRef()
+	}
+	vfs.mountMu.Unlock()
+	defer func() {
+		for _, mnt := range mounts {
+			mnt.DecRef(ctx)
+		}
+	}()
 	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+
 	for _, mnt := range mounts {
 		// Get the path to this mount relative to task root.
 		mntRootVD := VirtualDentry{
@@ -747,7 +783,7 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi
 		if err != nil {
 			// For some reason we didn't get a path. Log a warning
 			// and run with empty path.
-			ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+			ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err)
 			path = ""
 		}
 		if path == "" {
@@ -781,11 +817,25 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi
 //
 // Preconditions: taskRootDir.Ok().
 func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
-	vfs.mountMu.Lock()
-	defer vfs.mountMu.Unlock()
 	rootMnt := taskRootDir.mount
+
+	vfs.mountMu.Lock()
 	mounts := rootMnt.submountsLocked()
+	// Take a reference on mounts since we need to drop vfs.mountMu before
+	// calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or
+	// vfs.StatAt() (=> FilesystemImpl.StatAt()).
+	for _, mnt := range mounts {
+		mnt.IncRef()
+	}
+	vfs.mountMu.Unlock()
+	defer func() {
+		for _, mnt := range mounts {
+			mnt.DecRef(ctx)
+		}
+	}()
 	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+
+	creds := auth.CredentialsFromContext(ctx)
 	for _, mnt := range mounts {
 		// Get the path to this mount relative to task root.
 		mntRootVD := VirtualDentry{
@@ -796,7 +846,7 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
 		if err != nil {
 			// For some reason we didn't get a path. Log a warning
 			// and run with empty path.
-			ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+			ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err)
 			path = ""
 		}
 		if path == "" {
@@ -809,9 +859,10 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
 			Root:  mntRootVD,
 			Start: mntRootVD,
 		}
-		statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{})
+		statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{})
 		if err != nil {
 			// Well that's not good. Ignore this mount.
+			ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err)
 			break
 		}
 
@@ -823,6 +874,9 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
 		fmt.Fprintf(buf, "%d ", mnt.ID)
 
 		// (2)  Parent ID (or this ID if there is no parent).
+		// Note that even if the call to mnt.parent() races with Mount
+		// destruction (which is possible since we're not holding vfs.mountMu),
+		// its Mount.ID will still be valid.
 		pID := mnt.ID
 		if p := mnt.parent(); p != nil {
 			pID = p.ID
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index 3335e4057..cb8c56bd3 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -38,7 +38,7 @@ func TestMountTableInsertLookup(t *testing.T) {
 	mt.Init()
 
 	mount := &Mount{}
-	mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
+	mount.setKey(VirtualDentry{&Mount{}, &Dentry{}})
 	mt.Insert(mount)
 
 	if m := mt.Lookup(mount.parent(), mount.point()); m != mount {
@@ -79,7 +79,7 @@ const enableComparativeBenchmarks = false
 
 func newBenchMount() *Mount {
 	mount := &Mount{}
-	mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
+	mount.loadKey(VirtualDentry{&Mount{}, &Dentry{}})
 	return mount
 }
 
@@ -94,7 +94,7 @@ func BenchmarkMountTableParallelLookup(b *testing.B) {
 				for i := 0; i < numMounts; i++ {
 					mount := newBenchMount()
 					mt.Insert(mount)
-					keys = append(keys, mount.loadKey())
+					keys = append(keys, mount.saveKey())
 				}
 
 				var ready sync.WaitGroup
@@ -146,7 +146,7 @@ func BenchmarkMountMapParallelLookup(b *testing.B) {
 				keys := make([]VirtualDentry, 0, numMounts)
 				for i := 0; i < numMounts; i++ {
 					mount := newBenchMount()
-					key := mount.loadKey()
+					key := mount.saveKey()
 					ms[key] = mount
 					keys = append(keys, key)
 				}
@@ -201,7 +201,7 @@ func BenchmarkMountSyncMapParallelLookup(b *testing.B) {
 				keys := make([]VirtualDentry, 0, numMounts)
 				for i := 0; i < numMounts; i++ {
 					mount := newBenchMount()
-					key := mount.loadKey()
+					key := mount.getKey()
 					ms.Store(key, mount)
 					keys = append(keys, key)
 				}
@@ -283,7 +283,7 @@ func BenchmarkMountMapNegativeLookup(b *testing.B) {
 			ms := make(map[VirtualDentry]*Mount)
 			for i := 0; i < numMounts; i++ {
 				mount := newBenchMount()
-				ms[mount.loadKey()] = mount
+				ms[mount.getKey()] = mount
 			}
 			negkeys := make([]VirtualDentry, 0, numMounts)
 			for i := 0; i < numMounts; i++ {
@@ -318,7 +318,7 @@ func BenchmarkMountSyncMapNegativeLookup(b *testing.B) {
 			var ms sync.Map
 			for i := 0; i < numMounts; i++ {
 				mount := newBenchMount()
-				ms.Store(mount.loadKey(), mount)
+				ms.Store(mount.saveKey(), mount)
 			}
 			negkeys := make([]VirtualDentry, 0, numMounts)
 			for i := 0; i < numMounts; i++ {
@@ -372,7 +372,7 @@ func BenchmarkMountMapInsert(b *testing.B) {
 	b.ResetTimer()
 	for i := range mounts {
 		mount := mounts[i]
-		ms[mount.loadKey()] = mount
+		ms[mount.saveKey()] = mount
 	}
 }
 
@@ -392,7 +392,7 @@ func BenchmarkMountSyncMapInsert(b *testing.B) {
 	b.ResetTimer()
 	for i := range mounts {
 		mount := mounts[i]
-		ms.Store(mount.loadKey(), mount)
+		ms.Store(mount.saveKey(), mount)
 	}
 }
 
@@ -425,13 +425,13 @@ func BenchmarkMountMapRemove(b *testing.B) {
 	ms := make(map[VirtualDentry]*Mount)
 	for i := range mounts {
 		mount := mounts[i]
-		ms[mount.loadKey()] = mount
+		ms[mount.saveKey()] = mount
 	}
 
 	b.ResetTimer()
 	for i := range mounts {
 		mount := mounts[i]
-		delete(ms, mount.loadKey())
+		delete(ms, mount.saveKey())
 	}
 }
 
@@ -447,12 +447,12 @@ func BenchmarkMountSyncMapRemove(b *testing.B) {
 	var ms sync.Map
 	for i := range mounts {
 		mount := mounts[i]
-		ms.Store(mount.loadKey(), mount)
+		ms.Store(mount.saveKey(), mount)
 	}
 
 	b.ResetTimer()
 	for i := range mounts {
 		mount := mounts[i]
-		ms.Delete(mount.loadKey())
+		ms.Delete(mount.saveKey())
 	}
 }
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index 70f850ca4..cb48c37a1 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
@@ -34,6 +34,8 @@ import (
 // structurally identical to VirtualDentry, but stores its fields as
 // unsafe.Pointer since mutators synchronize with VFS path traversal using
 // seqcounts.
+//
+// This is explicitly not savable.
 type mountKey struct {
 	parent unsafe.Pointer // *Mount
 	point  unsafe.Pointer // *Dentry
@@ -47,19 +49,23 @@ func (mnt *Mount) point() *Dentry {
 	return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
 }
 
-func (mnt *Mount) loadKey() VirtualDentry {
+func (mnt *Mount) getKey() VirtualDentry {
 	return VirtualDentry{
 		mount:  mnt.parent(),
 		dentry: mnt.point(),
 	}
 }
 
+func (mnt *Mount) saveKey() VirtualDentry { return mnt.getKey() }
+
 // Invariant: mnt.key.parent == nil. vd.Ok().
-func (mnt *Mount) storeKey(vd VirtualDentry) {
+func (mnt *Mount) setKey(vd VirtualDentry) {
 	atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount))
 	atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry))
 }
 
+func (mnt *Mount) loadKey(vd VirtualDentry) { mnt.setKey(vd) }
+
 // mountTable maps (mount parent, mount point) pairs to mounts. It supports
 // efficient concurrent lookup, even in the presence of concurrent mutators
 // (provided mutation is sufficiently uncommon).
@@ -205,6 +211,26 @@ loop:
 	}
 }
 
+// Range calls f on each Mount in mt. If f returns false, Range stops iteration
+// and returns immediately.
+func (mt *mountTable) Range(f func(*Mount) bool) {
+	tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
+	slotPtr := mt.slots
+	last := unsafe.Pointer(uintptr(mt.slots) + ((tcap - 1) * mountSlotBytes))
+	for {
+		slot := (*mountSlot)(slotPtr)
+		if slot.value != nil {
+			if !f((*Mount)(slot.value)) {
+				return
+			}
+		}
+		if slotPtr == last {
+			return
+		}
+		slotPtr = unsafe.Pointer(uintptr(slotPtr) + mountSlotBytes)
+	}
+}
+
 // Insert inserts the given mount into mt.
 //
 // Preconditions: mt must not already contain a Mount with the same mount point
@@ -217,8 +243,9 @@ func (mt *mountTable) Insert(mount *Mount) {
 
 // insertSeqed inserts the given mount into mt.
 //
-// Preconditions: mt.seq must be in a writer critical section. mt must not
-// already contain a Mount with the same mount point and parent.
+// Preconditions:
+// * mt.seq must be in a writer critical section.
+// * mt must not already contain a Mount with the same mount point and parent.
 func (mt *mountTable) insertSeqed(mount *Mount) {
 	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
 
@@ -269,9 +296,11 @@ func (mt *mountTable) insertSeqed(mount *Mount) {
 	atomic.StorePointer(&mt.slots, newSlots)
 }
 
-// Preconditions: There are no concurrent mutators of the table (slots, cap).
-// If the table is visible to readers, then mt.seq must be in a writer critical
-// section. cap must be a power of 2.
+// Preconditions:
+// * There are no concurrent mutators of the table (slots, cap).
+// * If the table is visible to readers, then mt.seq must be in a writer
+//   critical section.
+// * cap must be a power of 2.
 func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) {
 	mask := cap - 1
 	off := (hash & mask) * mountSlotBytes
@@ -313,8 +342,9 @@ func (mt *mountTable) Remove(mount *Mount) {
 
 // removeSeqed removes the given mount from mt.
 //
-// Preconditions: mt.seq must be in a writer critical section. mt must contain
-// mount.
+// Preconditions:
+// * mt.seq must be in a writer critical section.
+// * mt must contain mount.
 func (mt *mountTable) removeSeqed(mount *Mount) {
 	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
 	tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index dfc8573fd..bc79e5ecc 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -21,6 +21,8 @@ import (
 
 // GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
 // FilesystemImpl.GetDentryAt().
+//
+// +stateify savable
 type GetDentryOptions struct {
 	// If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that
 	// the returned Dentry is a directory for which creds has search
@@ -30,6 +32,8 @@ type GetDentryOptions struct {
 
 // MkdirOptions contains options to VirtualFilesystem.MkdirAt() and
 // FilesystemImpl.MkdirAt().
+//
+// +stateify savable
 type MkdirOptions struct {
 	// Mode is the file mode bits for the created directory.
 	Mode linux.FileMode
@@ -56,6 +60,8 @@ type MkdirOptions struct {
 
 // MknodOptions contains options to VirtualFilesystem.MknodAt() and
 // FilesystemImpl.MknodAt().
+//
+// +stateify savable
 type MknodOptions struct {
 	// Mode is the file type and mode bits for the created file.
 	Mode linux.FileMode
@@ -72,6 +78,8 @@ type MknodOptions struct {
 
 // MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC.
 // MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers.
+//
+// +stateify savable
 type MountFlags struct {
 	// NoExec is equivalent to MS_NOEXEC.
 	NoExec bool
@@ -93,6 +101,8 @@ type MountFlags struct {
 }
 
 // MountOptions contains options to VirtualFilesystem.MountAt().
+//
+// +stateify savable
 type MountOptions struct {
 	// Flags contains flags as specified for mount(2), e.g. MS_NOEXEC.
 	Flags MountFlags
@@ -103,13 +113,17 @@ type MountOptions struct {
 	// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
 	GetFilesystemOptions GetFilesystemOptions
 
-	// If InternalMount is true, allow the use of filesystem types for which
-	// RegisterFilesystemTypeOptions.AllowUserMount == false.
+	// InternalMount indicates whether the mount operation is coming from the
+	// application, i.e. through mount(2). If InternalMount is true, allow the use
+	// of filesystem types for which RegisterFilesystemTypeOptions.AllowUserMount
+	// == false.
 	InternalMount bool
 }
 
 // OpenOptions contains options to VirtualFilesystem.OpenAt() and
 // FilesystemImpl.OpenAt().
+//
+// +stateify savable
 type OpenOptions struct {
 	// Flags contains access mode and flags as specified for open(2).
 	//
@@ -135,6 +149,8 @@ type OpenOptions struct {
 // ReadOptions contains options to FileDescription.PRead(),
 // FileDescriptionImpl.PRead(), FileDescription.Read(), and
 // FileDescriptionImpl.Read().
+//
+// +stateify savable
 type ReadOptions struct {
 	// Flags contains flags as specified for preadv2(2).
 	Flags uint32
@@ -142,6 +158,8 @@ type ReadOptions struct {
 
 // RenameOptions contains options to VirtualFilesystem.RenameAt() and
 // FilesystemImpl.RenameAt().
+//
+// +stateify savable
 type RenameOptions struct {
 	// Flags contains flags as specified for renameat2(2).
 	Flags uint32
@@ -153,6 +171,8 @@ type RenameOptions struct {
 // SetStatOptions contains options to VirtualFilesystem.SetStatAt(),
 // FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and
 // FileDescriptionImpl.SetStat().
+//
+// +stateify savable
 type SetStatOptions struct {
 	// Stat is the metadata that should be set. Only fields indicated by
 	// Stat.Mask should be set.
@@ -174,6 +194,8 @@ type SetStatOptions struct {
 
 // BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt()
 // and FilesystemImpl.BoundEndpointAt().
+//
+// +stateify savable
 type BoundEndpointOptions struct {
 	// Addr is the path of the file whose socket endpoint is being retrieved.
 	// It is generally irrelevant: most endpoints are stored at a dentry that
@@ -190,10 +212,12 @@ type BoundEndpointOptions struct {
 	Addr string
 }
 
-// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(),
-// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and
-// FileDescriptionImpl.Getxattr().
-type GetxattrOptions struct {
+// GetXattrOptions contains options to VirtualFilesystem.GetXattrAt(),
+// FilesystemImpl.GetXattrAt(), FileDescription.GetXattr(), and
+// FileDescriptionImpl.GetXattr().
+//
+// +stateify savable
+type GetXattrOptions struct {
 	// Name is the name of the extended attribute to retrieve.
 	Name string
 
@@ -204,10 +228,12 @@ type GetxattrOptions struct {
 	Size uint64
 }
 
-// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
-// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
-// FileDescriptionImpl.Setxattr().
-type SetxattrOptions struct {
+// SetXattrOptions contains options to VirtualFilesystem.SetXattrAt(),
+// FilesystemImpl.SetXattrAt(), FileDescription.SetXattr(), and
+// FileDescriptionImpl.SetXattr().
+//
+// +stateify savable
+type SetXattrOptions struct {
 	// Name is the name of the extended attribute being mutated.
 	Name string
 
@@ -221,6 +247,8 @@ type SetxattrOptions struct {
 // StatOptions contains options to VirtualFilesystem.StatAt(),
 // FilesystemImpl.StatAt(), FileDescription.Stat(), and
 // FileDescriptionImpl.Stat().
+//
+// +stateify savable
 type StatOptions struct {
 	// Mask is the set of fields in the returned Statx that the FilesystemImpl
 	// or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask.
@@ -238,6 +266,8 @@ type StatOptions struct {
 }
 
 // UmountOptions contains options to VirtualFilesystem.UmountAt().
+//
+// +stateify savable
 type UmountOptions struct {
 	// Flags contains flags as specified for umount2(2).
 	Flags uint32
@@ -246,6 +276,8 @@ type UmountOptions struct {
 // WriteOptions contains options to FileDescription.PWrite(),
 // FileDescriptionImpl.PWrite(), FileDescription.Write(), and
 // FileDescriptionImpl.Write().
+//
+// +stateify savable
 type WriteOptions struct {
 	// Flags contains flags as specified for pwritev2(2).
 	Flags uint32
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index 33389c1df..d48520d58 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -16,6 +16,7 @@ package vfs
 
 import (
 	"math"
+	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -25,6 +26,8 @@ import (
 )
 
 // AccessTypes is a bitmask of Unix file permissions.
+//
+// +stateify savable
 type AccessTypes uint16
 
 // Bits in AccessTypes.
@@ -271,7 +274,7 @@ func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth
 // operation must not proceed. Otherwise it returns the max length allowed to
 // without violating the limit.
 func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
-	fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur
+	fileSizeLimit := limits.FromContextOrDie(ctx).Get(limits.FileSize).Cur
 	if fileSizeLimit > math.MaxInt64 {
 		return size, nil
 	}
@@ -284,3 +287,40 @@ func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
 	}
 	return size, nil
 }
+
+// CheckXattrPermissions checks permissions for extended attribute access.
+// This is analogous to fs/xattr.c:xattr_permission(). Some key differences:
+// * Does not check for read-only filesystem property.
+// * Does not check inode immutability or append only mode. In both cases EPERM
+//   must be returned by filesystem implementations.
+// * Does not do inode permission checks. Filesystem implementations should
+//   handle inode permission checks as they may differ across implementations.
+func CheckXattrPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, name string) error {
+	switch {
+	case strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX):
+		// The trusted.* namespace can only be accessed by privileged
+		// users.
+		if creds.HasCapability(linux.CAP_SYS_ADMIN) {
+			return nil
+		}
+		if ats.MayWrite() {
+			return syserror.EPERM
+		}
+		return syserror.ENODATA
+	case strings.HasPrefix(name, linux.XATTR_USER_PREFIX):
+		// In the user.* namespace, only regular files and directories can have
+		// extended attributes. For sticky directories, only the owner and
+		// privileged users can write attributes.
+		filetype := mode.FileType()
+		if filetype != linux.ModeRegular && filetype != linux.ModeDirectory {
+			if ats.MayWrite() {
+				return syserror.EPERM
+			}
+			return syserror.ENODATA
+		}
+		if filetype == linux.ModeDirectory && mode&linux.ModeSticky != 0 && ats.MayWrite() && !CanActAsOwner(creds, kuid) {
+			return syserror.EPERM
+		}
+	}
+	return nil
+}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 3304372d9..e4fd55012 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -35,6 +35,8 @@ import (
 // FilesystemImpl methods.
 //
 // ResolvingPath is loosely analogous to Linux's struct nameidata.
+//
+// +stateify savable
 type ResolvingPath struct {
 	vfs   *VirtualFilesystem
 	root  VirtualDentry // refs borrowed from PathOperation
@@ -88,6 +90,7 @@ func init() {
 // so error "constants" are really mutable vars, necessitating somewhat
 // expensive interface object comparisons.
 
+// +stateify savable
 type resolveMountRootOrJumpError struct{}
 
 // Error implements error.Error.
@@ -95,6 +98,7 @@ func (resolveMountRootOrJumpError) Error() string {
 	return "resolving mount root or jump"
 }
 
+// +stateify savable
 type resolveMountPointError struct{}
 
 // Error implements error.Error.
@@ -102,6 +106,7 @@ func (resolveMountPointError) Error() string {
 	return "resolving mount point"
 }
 
+// +stateify savable
 type resolveAbsSymlinkError struct{}
 
 // Error implements error.Error.
diff --git a/pkg/sentry/vfs/save_restore.go b/pkg/sentry/vfs/save_restore.go
new file mode 100644
index 000000000..46e50d55d
--- /dev/null
+++ b/pkg/sentry/vfs/save_restore.go
@@ -0,0 +1,124 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
+)
+
+// FilesystemImplSaveRestoreExtension is an optional extension to
+// FilesystemImpl.
+type FilesystemImplSaveRestoreExtension interface {
+	// PrepareSave prepares this filesystem for serialization.
+	PrepareSave(ctx context.Context) error
+
+	// CompleteRestore completes restoration from checkpoint for this
+	// filesystem after deserialization.
+	CompleteRestore(ctx context.Context, opts CompleteRestoreOptions) error
+}
+
+// PrepareSave prepares all filesystems for serialization.
+func (vfs *VirtualFilesystem) PrepareSave(ctx context.Context) error {
+	failures := 0
+	for fs := range vfs.getFilesystems() {
+		if ext, ok := fs.impl.(FilesystemImplSaveRestoreExtension); ok {
+			if err := ext.PrepareSave(ctx); err != nil {
+				ctx.Warningf("%T.PrepareSave failed: %v", fs.impl, err)
+				failures++
+			}
+		}
+		fs.DecRef(ctx)
+	}
+	if failures != 0 {
+		return fmt.Errorf("%d filesystems failed to prepare for serialization", failures)
+	}
+	return nil
+}
+
+// CompleteRestore completes restoration from checkpoint for all filesystems
+// after deserialization.
+func (vfs *VirtualFilesystem) CompleteRestore(ctx context.Context, opts *CompleteRestoreOptions) error {
+	failures := 0
+	for fs := range vfs.getFilesystems() {
+		if ext, ok := fs.impl.(FilesystemImplSaveRestoreExtension); ok {
+			if err := ext.CompleteRestore(ctx, *opts); err != nil {
+				ctx.Warningf("%T.CompleteRestore failed: %v", fs.impl, err)
+				failures++
+			}
+		}
+		fs.DecRef(ctx)
+	}
+	if failures != 0 {
+		return fmt.Errorf("%d filesystems failed to complete restore after deserialization", failures)
+	}
+	return nil
+}
+
+// CompleteRestoreOptions contains options to
+// VirtualFilesystem.CompleteRestore() and
+// FilesystemImplSaveRestoreExtension.CompleteRestore().
+type CompleteRestoreOptions struct {
+	// If ValidateFileSizes is true, filesystem implementations backed by
+	// remote filesystems should verify that file sizes have not changed
+	// between checkpoint and restore.
+	ValidateFileSizes bool
+
+	// If ValidateFileModificationTimestamps is true, filesystem
+	// implementations backed by remote filesystems should validate that file
+	// mtimes have not changed between checkpoint and restore.
+	ValidateFileModificationTimestamps bool
+}
+
+// saveMounts is called by stateify.
+func (vfs *VirtualFilesystem) saveMounts() []*Mount {
+	if atomic.LoadPointer(&vfs.mounts.slots) == nil {
+		// vfs.Init() was never called.
+		return nil
+	}
+	var mounts []*Mount
+	vfs.mounts.Range(func(mount *Mount) bool {
+		mounts = append(mounts, mount)
+		return true
+	})
+	return mounts
+}
+
+// loadMounts is called by stateify.
+func (vfs *VirtualFilesystem) loadMounts(mounts []*Mount) {
+	if mounts == nil {
+		return
+	}
+	vfs.mounts.Init()
+	for _, mount := range mounts {
+		vfs.mounts.Insert(mount)
+	}
+}
+
+func (mnt *Mount) afterLoad() {
+	if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&mnt.refs) != 0 {
+		refsvfs2.Register(mnt, "vfs.Mount")
+	}
+}
+
+// afterLoad is called by stateify.
+func (epi *epollInterest) afterLoad() {
+	// Mark all epollInterests as ready after restore so that the next call to
+	// EpollInstance.ReadEvents() rechecks their readiness.
+	epi.Callback(nil)
+}
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 9c2420683..48d6252f7 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -24,9 +24,9 @@
 //           Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
 //         VirtualFilesystem.filesystemsMu
 //       EpollInstance.mu
-//		   Inotify.mu
-// 		     Watches.mu
-//  		     Inotify.evMu
+//       Inotify.mu
+//         Watches.mu
+//           Inotify.evMu
 // VirtualFilesystem.fsTypesMu
 //
 // Locking Dentry.mu in multiple Dentries requires holding
@@ -36,6 +36,7 @@ package vfs
 
 import (
 	"fmt"
+	"path"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -70,7 +71,7 @@ type VirtualFilesystem struct {
 	// points.
 	//
 	// mounts is analogous to Linux's mount_hashtable.
-	mounts mountTable
+	mounts mountTable `state:".([]*Mount)"`
 
 	// mountpoints maps mount points to mounts at those points in all
 	// namespaces. mountpoints is protected by mountMu.
@@ -157,11 +158,23 @@ func (vfs *VirtualFilesystem) Init(ctx context.Context) error {
 	return nil
 }
 
+// Release drops references on filesystem objects held by vfs.
+//
+// Precondition: This must be called after VFS.Init() has succeeded.
+func (vfs *VirtualFilesystem) Release(ctx context.Context) {
+	vfs.anonMount.DecRef(ctx)
+	for _, fst := range vfs.fsTypes {
+		fst.fsType.Release(ctx)
+	}
+}
+
 // PathOperation specifies the path operated on by a VFS method.
 //
 // PathOperation is passed to VFS methods by pointer to reduce memory copying:
 // it's somewhat large and should never escape. (Options structs are passed by
 // pointer to VFS and FileDescription methods for the same reason.)
+//
+// +stateify savable
 type PathOperation struct {
 	// Root is the VFS root. References on Root are borrowed from the provider
 	// of the PathOperation.
@@ -296,6 +309,8 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
 // MkdirAt creates a directory at the given path.
 func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
 	if !pop.Path.Begin.Ok() {
+		// pop.Path should not be empty in operations that create/delete files.
+		// This is consistent with mkdirat(dirfd, "", mode).
 		if pop.Path.Absolute {
 			return syserror.EEXIST
 		}
@@ -332,6 +347,8 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
 // error from the syserror package.
 func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
 	if !pop.Path.Begin.Ok() {
+		// pop.Path should not be empty in operations that create/delete files.
+		// This is consistent with mknodat(dirfd, "", mode, dev).
 		if pop.Path.Absolute {
 			return syserror.EEXIST
 		}
@@ -517,6 +534,8 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
 // RmdirAt removes the directory at the given path.
 func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
 	if !pop.Path.Begin.Ok() {
+		// pop.Path should not be empty in operations that create/delete files.
+		// This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR).
 		if pop.Path.Absolute {
 			return syserror.EBUSY
 		}
@@ -598,6 +617,8 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti
 // SymlinkAt creates a symbolic link at the given path with the given target.
 func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
 	if !pop.Path.Begin.Ok() {
+		// pop.Path should not be empty in operations that create/delete files.
+		// This is consistent with symlinkat(oldpath, newdirfd, "").
 		if pop.Path.Absolute {
 			return syserror.EEXIST
 		}
@@ -630,6 +651,8 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
 // UnlinkAt deletes the non-directory file at the given path.
 func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
 	if !pop.Path.Begin.Ok() {
+		// pop.Path should not be empty in operations that create/delete files.
+		// This is consistent with unlinkat(dirfd, "", 0).
 		if pop.Path.Absolute {
 			return syserror.EBUSY
 		}
@@ -661,12 +684,6 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
 
 // BoundEndpointAt gets the bound endpoint at the given path, if one exists.
 func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) {
-	if !pop.Path.Begin.Ok() {
-		if pop.Path.Absolute {
-			return nil, syserror.ECONNREFUSED
-		}
-		return nil, syserror.ENOENT
-	}
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts)
@@ -686,12 +703,12 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C
 	}
 }
 
-// ListxattrAt returns all extended attribute names for the file at the given
+// ListXattrAt returns all extended attribute names for the file at the given
 // path.
-func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
+func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size)
+		names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size)
 		if err == nil {
 			vfs.putResolvingPath(ctx, rp)
 			return names, nil
@@ -711,12 +728,12 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
 	}
 }
 
-// GetxattrAt returns the value associated with the given extended attribute
+// GetXattrAt returns the value associated with the given extended attribute
 // for the file at the given path.
-func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) {
+func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
+		val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
 		if err == nil {
 			vfs.putResolvingPath(ctx, rp)
 			return val, nil
@@ -728,12 +745,12 @@ func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Creden
 	}
 }
 
-// SetxattrAt changes the value associated with the given extended attribute
+// SetXattrAt changes the value associated with the given extended attribute
 // for the file at the given path.
-func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error {
+func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+		err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
 		if err == nil {
 			vfs.putResolvingPath(ctx, rp)
 			return nil
@@ -745,11 +762,11 @@ func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Creden
 	}
 }
 
-// RemovexattrAt removes the given extended attribute from the file at rp.
-func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
+// RemoveXattrAt removes the given extended attribute from the file at rp.
+func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+		err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
 		if err == nil {
 			vfs.putResolvingPath(ctx, rp)
 			return nil
@@ -763,23 +780,83 @@ func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Cre
 
 // SyncAllFilesystems has the semantics of Linux's sync(2).
 func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
+	var retErr error
+	for fs := range vfs.getFilesystems() {
+		if err := fs.impl.Sync(ctx); err != nil && retErr == nil {
+			retErr = err
+		}
+		fs.DecRef(ctx)
+	}
+	return retErr
+}
+
+func (vfs *VirtualFilesystem) getFilesystems() map[*Filesystem]struct{} {
 	fss := make(map[*Filesystem]struct{})
 	vfs.filesystemsMu.Lock()
+	defer vfs.filesystemsMu.Unlock()
 	for fs := range vfs.filesystems {
 		if !fs.TryIncRef() {
 			continue
 		}
 		fss[fs] = struct{}{}
 	}
-	vfs.filesystemsMu.Unlock()
-	var retErr error
-	for fs := range fss {
-		if err := fs.impl.Sync(ctx); err != nil && retErr == nil {
-			retErr = err
-		}
-		fs.DecRef(ctx)
+	return fss
+}
+
+// MkdirAllAt recursively creates non-existent directories on the given path
+// (including the last component).
+func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions) error {
+	pop := &PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(currentPath),
+	}
+	stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE})
+	switch err {
+	case nil:
+		if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory {
+			return syserror.ENOTDIR
+		}
+		// Directory already exists.
+		return nil
+	case syserror.ENOENT:
+		// Expected, we will create the dir.
+	default:
+		return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err)
+	}
+
+	// Recurse to ensure parent is created and then create the final directory.
+	if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts); err != nil {
+		return err
 	}
-	return retErr
+	if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil {
+		return fmt.Errorf("failed to create directory %q: %w", currentPath, err)
+	}
+	return nil
+}
+
+// MakeSyntheticMountpoint creates parent directories of target if they do not
+// exist and attempts to create a directory for the mountpoint. If a
+// non-directory file already exists there then we allow it.
+func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error {
+	mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
+
+	// Make sure the parent directory of target exists.
+	if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil {
+		return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err)
+	}
+
+	// Attempt to mkdir the final component. If a file (of any type) exists
+	// then we let allow mounting on top of that because we do not require the
+	// target to be an existing directory, unlike Linux mount(2).
+	if err := vfs.MkdirAt(ctx, creds, &PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(target),
+	}, mkdirOpts); err != nil && err != syserror.EEXIST {
+		return fmt.Errorf("failed to create mountpoint %q: %w", target, err)
+	}
+	return nil
 }
 
 // A VirtualDentry represents a node in a VFS tree, by combining a Dentry
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index 748273366..bbafb8b7f 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -96,15 +96,33 @@ const (
 	Panic
 )
 
+// Set implements flag.Value.
+func (a *Action) Set(v string) error {
+	switch v {
+	case "log", "logwarning":
+		*a = LogWarning
+	case "panic":
+		*a = Panic
+	default:
+		return fmt.Errorf("invalid watchdog action %q", v)
+	}
+	return nil
+}
+
+// Get implements flag.Value.
+func (a *Action) Get() interface{} {
+	return *a
+}
+
 // String returns Action's string representation.
-func (a Action) String() string {
-	switch a {
+func (a *Action) String() string {
+	switch *a {
 	case LogWarning:
-		return "LogWarning"
+		return "logWarning"
 	case Panic:
-		return "Panic"
+		return "panic"
 	default:
-		panic(fmt.Sprintf("Invalid action: %d", a))
+		panic(fmt.Sprintf("Invalid watchdog action: %d", *a))
 	}
 }
 
diff --git a/pkg/shim/v2/runtimeoptions/BUILD b/pkg/shim/v2/runtimeoptions/BUILD
index ba2ed1ea7..abb8c3be3 100644
--- a/pkg/shim/v2/runtimeoptions/BUILD
+++ b/pkg/shim/v2/runtimeoptions/BUILD
@@ -11,12 +11,12 @@ proto_library(
 
 go_library(
     name = "runtimeoptions",
-    srcs = ["runtimeoptions.go"],
-    visibility = ["//pkg/shim/v2:__pkg__"],
-    deps = [
-        ":api_go_proto",
-        "@com_github_gogo_protobuf//proto:go_default_library",
+    srcs = [
+        "runtimeoptions.go",
+        "runtimeoptions_cri.go",
     ],
+    visibility = ["//pkg/shim/v2:__pkg__"],
+    deps = ["@com_github_gogo_protobuf//proto:go_default_library"],
 )
 
 go_test(
@@ -27,6 +27,6 @@ go_test(
     deps = [
         "@com_github_containerd_containerd//runtime/v1/shim/v1:go_default_library",
         "@com_github_containerd_typeurl//:go_default_library",
-        "@com_github_golang_protobuf//proto:go_default_library",
+        "@com_github_gogo_protobuf//proto:go_default_library",
     ],
 )
diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions.go b/pkg/shim/v2/runtimeoptions/runtimeoptions.go
index aaf17b87a..072dd87f0 100644
--- a/pkg/shim/v2/runtimeoptions/runtimeoptions.go
+++ b/pkg/shim/v2/runtimeoptions/runtimeoptions.go
@@ -13,18 +13,5 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// Package runtimeoptions contains the runtimeoptions proto.
 package runtimeoptions
-
-import (
-	proto "github.com/gogo/protobuf/proto"
-	pb "gvisor.dev/gvisor/pkg/shim/v2/runtimeoptions/api_go_proto"
-)
-
-type Options = pb.Options
-
-func init() {
-	// The generated proto file auto registers with "golang/protobuf/proto"
-	// package. However, typeurl uses "golang/gogo/protobuf/proto". So registers
-	// the type there too.
-	proto.RegisterType((*Options)(nil), "cri.runtimeoptions.v1.Options")
-}
diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions_cri.go b/pkg/shim/v2/runtimeoptions/runtimeoptions_cri.go
new file mode 100644
index 000000000..e6102b4cf
--- /dev/null
+++ b/pkg/shim/v2/runtimeoptions/runtimeoptions_cri.go
@@ -0,0 +1,383 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package runtimeoptions
+
+import (
+	"fmt"
+	"io"
+	"reflect"
+	"strings"
+
+	proto "github.com/gogo/protobuf/proto"
+)
+
+// This is a compile-time assertion to ensure that this generated file
+// is compatible with the proto package it is being compiled against.
+// A compilation error at this line likely means your copy of the
+// proto package needs to be updated.
+const _ = proto.GoGoProtoPackageIsVersion2 // please upgrade the proto package
+
+type Options struct {
+	// TypeUrl specifies the type of the content inside the config file.
+	TypeUrl string `protobuf:"bytes,1,opt,name=type_url,json=typeUrl,proto3" json:"type_url,omitempty"`
+	// ConfigPath specifies the filesystem location of the config file
+	// used by the runtime.
+	ConfigPath string `protobuf:"bytes,2,opt,name=config_path,json=configPath,proto3" json:"config_path,omitempty"`
+}
+
+func (m *Options) Reset()                    { *m = Options{} }
+func (*Options) ProtoMessage()               {}
+func (*Options) Descriptor() ([]byte, []int) { return fileDescriptorApi, []int{0} }
+
+func (m *Options) GetTypeUrl() string {
+	if m != nil {
+		return m.TypeUrl
+	}
+	return ""
+}
+
+func (m *Options) GetConfigPath() string {
+	if m != nil {
+		return m.ConfigPath
+	}
+	return ""
+}
+
+func init() {
+	proto.RegisterType((*Options)(nil), "cri.runtimeoptions.v1.Options")
+}
+
+func (m *Options) Marshal() (dAtA []byte, err error) {
+	size := m.Size()
+	dAtA = make([]byte, size)
+	n, err := m.MarshalTo(dAtA)
+	if err != nil {
+		return nil, err
+	}
+	return dAtA[:n], nil
+}
+
+func (m *Options) MarshalTo(dAtA []byte) (int, error) {
+	var i int
+	_ = i
+	var l int
+	_ = l
+	if len(m.TypeUrl) > 0 {
+		dAtA[i] = 0xa
+		i++
+		i = encodeVarintApi(dAtA, i, uint64(len(m.TypeUrl)))
+		i += copy(dAtA[i:], m.TypeUrl)
+	}
+	if len(m.ConfigPath) > 0 {
+		dAtA[i] = 0x12
+		i++
+		i = encodeVarintApi(dAtA, i, uint64(len(m.ConfigPath)))
+		i += copy(dAtA[i:], m.ConfigPath)
+	}
+	return i, nil
+}
+
+func encodeVarintApi(dAtA []byte, offset int, v uint64) int {
+	for v >= 1<<7 {
+		dAtA[offset] = uint8(v&0x7f | 0x80)
+		v >>= 7
+		offset++
+	}
+	dAtA[offset] = uint8(v)
+	return offset + 1
+}
+
+func (m *Options) Size() (n int) {
+	var l int
+	_ = l
+	l = len(m.TypeUrl)
+	if l > 0 {
+		n += 1 + l + sovApi(uint64(l))
+	}
+	l = len(m.ConfigPath)
+	if l > 0 {
+		n += 1 + l + sovApi(uint64(l))
+	}
+	return n
+}
+
+func sovApi(x uint64) (n int) {
+	for {
+		n++
+		x >>= 7
+		if x == 0 {
+			break
+		}
+	}
+	return n
+}
+
+func sozApi(x uint64) (n int) {
+	return sovApi(uint64((x << 1) ^ uint64((int64(x) >> 63))))
+}
+
+func (this *Options) String() string {
+	if this == nil {
+		return "nil"
+	}
+	s := strings.Join([]string{`&Options{`,
+		`TypeUrl:` + fmt.Sprintf("%v", this.TypeUrl) + `,`,
+		`ConfigPath:` + fmt.Sprintf("%v", this.ConfigPath) + `,`,
+		`}`,
+	}, "")
+	return s
+}
+
+func valueToStringApi(v interface{}) string {
+	rv := reflect.ValueOf(v)
+	if rv.IsNil() {
+		return "nil"
+	}
+	pv := reflect.Indirect(rv).Interface()
+	return fmt.Sprintf("*%v", pv)
+}
+
+func (m *Options) Unmarshal(dAtA []byte) error {
+	l := len(dAtA)
+	iNdEx := 0
+	for iNdEx < l {
+		preIndex := iNdEx
+		var wire uint64
+		for shift := uint(0); ; shift += 7 {
+			if shift >= 64 {
+				return ErrIntOverflowApi
+			}
+			if iNdEx >= l {
+				return io.ErrUnexpectedEOF
+			}
+			b := dAtA[iNdEx]
+			iNdEx++
+			wire |= (uint64(b) & 0x7F) << shift
+			if b < 0x80 {
+				break
+			}
+		}
+		fieldNum := int32(wire >> 3)
+		wireType := int(wire & 0x7)
+		if wireType == 4 {
+			return fmt.Errorf("proto: Options: wiretype end group for non-group")
+		}
+		if fieldNum <= 0 {
+			return fmt.Errorf("proto: Options: illegal tag %d (wire type %d)", fieldNum, wire)
+		}
+		switch fieldNum {
+		case 1:
+			if wireType != 2 {
+				return fmt.Errorf("proto: wrong wireType = %d for field TypeUrl", wireType)
+			}
+			var stringLen uint64
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return ErrIntOverflowApi
+				}
+				if iNdEx >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := dAtA[iNdEx]
+				iNdEx++
+				stringLen |= (uint64(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			intStringLen := int(stringLen)
+			if intStringLen < 0 {
+				return ErrInvalidLengthApi
+			}
+			postIndex := iNdEx + intStringLen
+			if postIndex > l {
+				return io.ErrUnexpectedEOF
+			}
+			m.TypeUrl = string(dAtA[iNdEx:postIndex])
+			iNdEx = postIndex
+		case 2:
+			if wireType != 2 {
+				return fmt.Errorf("proto: wrong wireType = %d for field ConfigPath", wireType)
+			}
+			var stringLen uint64
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return ErrIntOverflowApi
+				}
+				if iNdEx >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := dAtA[iNdEx]
+				iNdEx++
+				stringLen |= (uint64(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			intStringLen := int(stringLen)
+			if intStringLen < 0 {
+				return ErrInvalidLengthApi
+			}
+			postIndex := iNdEx + intStringLen
+			if postIndex > l {
+				return io.ErrUnexpectedEOF
+			}
+			m.ConfigPath = string(dAtA[iNdEx:postIndex])
+			iNdEx = postIndex
+		default:
+			iNdEx = preIndex
+			skippy, err := skipApi(dAtA[iNdEx:])
+			if err != nil {
+				return err
+			}
+			if skippy < 0 {
+				return ErrInvalidLengthApi
+			}
+			if (iNdEx + skippy) > l {
+				return io.ErrUnexpectedEOF
+			}
+			iNdEx += skippy
+		}
+	}
+
+	if iNdEx > l {
+		return io.ErrUnexpectedEOF
+	}
+	return nil
+}
+
+func skipApi(dAtA []byte) (n int, err error) {
+	l := len(dAtA)
+	iNdEx := 0
+	for iNdEx < l {
+		var wire uint64
+		for shift := uint(0); ; shift += 7 {
+			if shift >= 64 {
+				return 0, ErrIntOverflowApi
+			}
+			if iNdEx >= l {
+				return 0, io.ErrUnexpectedEOF
+			}
+			b := dAtA[iNdEx]
+			iNdEx++
+			wire |= (uint64(b) & 0x7F) << shift
+			if b < 0x80 {
+				break
+			}
+		}
+		wireType := int(wire & 0x7)
+		switch wireType {
+		case 0:
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return 0, ErrIntOverflowApi
+				}
+				if iNdEx >= l {
+					return 0, io.ErrUnexpectedEOF
+				}
+				iNdEx++
+				if dAtA[iNdEx-1] < 0x80 {
+					break
+				}
+			}
+			return iNdEx, nil
+		case 1:
+			iNdEx += 8
+			return iNdEx, nil
+		case 2:
+			var length int
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return 0, ErrIntOverflowApi
+				}
+				if iNdEx >= l {
+					return 0, io.ErrUnexpectedEOF
+				}
+				b := dAtA[iNdEx]
+				iNdEx++
+				length |= (int(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			iNdEx += length
+			if length < 0 {
+				return 0, ErrInvalidLengthApi
+			}
+			return iNdEx, nil
+		case 3:
+			for {
+				var innerWire uint64
+				var start int = iNdEx
+				for shift := uint(0); ; shift += 7 {
+					if shift >= 64 {
+						return 0, ErrIntOverflowApi
+					}
+					if iNdEx >= l {
+						return 0, io.ErrUnexpectedEOF
+					}
+					b := dAtA[iNdEx]
+					iNdEx++
+					innerWire |= (uint64(b) & 0x7F) << shift
+					if b < 0x80 {
+						break
+					}
+				}
+				innerWireType := int(innerWire & 0x7)
+				if innerWireType == 4 {
+					break
+				}
+				next, err := skipApi(dAtA[start:])
+				if err != nil {
+					return 0, err
+				}
+				iNdEx = start + next
+			}
+			return iNdEx, nil
+		case 4:
+			return iNdEx, nil
+		case 5:
+			iNdEx += 4
+			return iNdEx, nil
+		default:
+			return 0, fmt.Errorf("proto: illegal wireType %d", wireType)
+		}
+	}
+	panic("unreachable")
+}
+
+var (
+	ErrInvalidLengthApi = fmt.Errorf("proto: negative length found during unmarshaling")
+	ErrIntOverflowApi   = fmt.Errorf("proto: integer overflow")
+)
+
+func init() { proto.RegisterFile("api.proto", fileDescriptorApi) }
+
+var fileDescriptorApi = []byte{
+	// 183 bytes of a gzipped FileDescriptorProto
+	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0xe2, 0x4c, 0x2c, 0xc8, 0xd4,
+	0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0x12, 0x4d, 0x2e, 0xca, 0xd4, 0x2b, 0x2a, 0xcd, 0x2b, 0xc9,
+	0xcc, 0x4d, 0xcd, 0x2f, 0x28, 0xc9, 0xcc, 0xcf, 0x2b, 0xd6, 0x2b, 0x33, 0x94, 0xd2, 0x4d, 0xcf,
+	0x2c, 0xc9, 0x28, 0x4d, 0xd2, 0x4b, 0xce, 0xcf, 0xd5, 0x4f, 0xcf, 0x4f, 0xcf, 0xd7, 0x07, 0xab,
+	0x4e, 0x2a, 0x4d, 0x03, 0xf3, 0xc0, 0x1c, 0x30, 0x0b, 0x62, 0x8a, 0x92, 0x2b, 0x17, 0xbb, 0x3f,
+	0x44, 0xb3, 0x90, 0x24, 0x17, 0x47, 0x49, 0x65, 0x41, 0x6a, 0x7c, 0x69, 0x51, 0x8e, 0x04, 0xa3,
+	0x02, 0xa3, 0x06, 0x67, 0x10, 0x3b, 0x88, 0x1f, 0x5a, 0x94, 0x23, 0x24, 0xcf, 0xc5, 0x9d, 0x9c,
+	0x9f, 0x97, 0x96, 0x99, 0x1e, 0x5f, 0x90, 0x58, 0x92, 0x21, 0xc1, 0x04, 0x96, 0xe5, 0x82, 0x08,
+	0x05, 0x24, 0x96, 0x64, 0x38, 0xc9, 0x9c, 0x78, 0x28, 0xc7, 0x78, 0xe3, 0xa1, 0x1c, 0x43, 0xc3,
+	0x23, 0x39, 0xc6, 0x13, 0x8f, 0xe4, 0x18, 0x2f, 0x3c, 0x92, 0x63, 0x7c, 0xf0, 0x48, 0x8e, 0x71,
+	0xc2, 0x63, 0x39, 0x86, 0x24, 0x36, 0xb0, 0x5d, 0xc6, 0x80, 0x00, 0x00, 0x00, 0xff, 0xff, 0x07,
+	0x00, 0xf2, 0x18, 0xbe, 0x00, 0x00, 0x00,
+}
diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go b/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go
index f4c238a00..c59a2400e 100644
--- a/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go
+++ b/pkg/shim/v2/runtimeoptions/runtimeoptions_test.go
@@ -15,11 +15,12 @@
 package runtimeoptions
 
 import (
+	"bytes"
 	"testing"
 
 	shim "github.com/containerd/containerd/runtime/v1/shim/v1"
 	"github.com/containerd/typeurl"
-	"github.com/golang/protobuf/proto"
+	"github.com/gogo/protobuf/proto"
 )
 
 func TestCreateTaskRequest(t *testing.T) {
@@ -32,7 +33,11 @@ func TestCreateTaskRequest(t *testing.T) {
 	if err := proto.UnmarshalText(encodedText, got); err != nil {
 		t.Fatalf("unable to unmarshal text: %v", err)
 	}
-	t.Logf("got: %s", proto.MarshalTextString(got))
+	var textBuffer bytes.Buffer
+	if err := proto.MarshalText(&textBuffer, got); err != nil {
+		t.Errorf("unable to marshal text: %v", err)
+	}
+	t.Logf("got: %s", string(textBuffer.Bytes()))
 
 	// Check the options.
 	wantOptions := &Options{}
diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go
index 118805492..19bce2afb 100644
--- a/pkg/sleep/sleep_unsafe.go
+++ b/pkg/sleep/sleep_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.11
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/state/BUILD b/pkg/state/BUILD
index 089b3bbef..92c51879b 100644
--- a/pkg/state/BUILD
+++ b/pkg/state/BUILD
@@ -4,19 +4,6 @@ load("//tools/go_generics:defs.bzl", "go_template_instance")
 package(licenses = ["notice"])
 
 go_template_instance(
-    name = "pending_list",
-    out = "pending_list.go",
-    package = "state",
-    prefix = "pending",
-    template = "//pkg/ilist:generic_list",
-    types = {
-        "Element": "*objectEncodeState",
-        "ElementMapper": "pendingMapper",
-        "Linker": "*pendingEntry",
-    },
-)
-
-go_template_instance(
     name = "deferred_list",
     out = "deferred_list.go",
     package = "state",
@@ -83,7 +70,6 @@ go_library(
         "deferred_list.go",
         "encode.go",
         "encode_unsafe.go",
-        "pending_list.go",
         "state.go",
         "state_norace.go",
         "state_race.go",
diff --git a/pkg/state/decode.go b/pkg/state/decode.go
index c9971cdf6..e519ddeca 100644
--- a/pkg/state/decode.go
+++ b/pkg/state/decode.go
@@ -21,6 +21,7 @@ import (
 	"math"
 	"reflect"
 
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/state/wire"
 )
 
@@ -258,7 +259,7 @@ func (ds *decodeState) waitObject(ods *objectDecodeState, encoded wire.Object, c
 // For the purposes of this function, a child object is either a field within a
 // struct or an array element, with one such indirection per element in
 // path. The returned value may be an unexported field, so it may not be
-// directly assignable. See unsafePointerTo.
+// directly assignable. See decode_unsafe.go.
 func walkChild(path []wire.Dot, obj reflect.Value) reflect.Value {
 	// See wire.Ref.Dots. The path here is specified in reverse order.
 	for i := len(path) - 1; i >= 0; i-- {
@@ -519,9 +520,7 @@ func (ds *decodeState) decodeObject(ods *objectDecodeState, obj reflect.Value, e
 
 		// Normal assignment: authoritative only if no dots.
 		v := ds.register(x, obj.Type().Elem())
-		if v.IsValid() {
-			obj.Set(unsafePointerTo(v))
-		}
+		obj.Set(reflectValueRWAddr(v))
 	case wire.Bool:
 		obj.SetBool(bool(x))
 	case wire.Int:
@@ -559,7 +558,7 @@ func (ds *decodeState) decodeObject(ods *objectDecodeState, obj reflect.Value, e
 		// contents will still be filled in later on.
 		typ := reflect.ArrayOf(int(x.Capacity), obj.Type().Elem()) // The object type.
 		v := ds.register(&x.Ref, typ)
-		obj.Set(v.Slice3(0, int(x.Length), int(x.Capacity)))
+		obj.Set(reflectValueRWSlice3(v, 0, int(x.Length), int(x.Capacity)))
 	case *wire.Array:
 		ds.decodeArray(ods, obj, x)
 	case *wire.Struct:
@@ -584,13 +583,15 @@ func (ds *decodeState) Load(obj reflect.Value) {
 	})
 
 	// Create the root object.
-	ds.objectsByID = append(ds.objectsByID, &objectDecodeState{
+	rootOds := &objectDecodeState{
 		id:  1,
 		obj: obj,
-	})
+	}
+	ds.objectsByID = append(ds.objectsByID, rootOds)
+	ds.pending.PushBack(rootOds)
 
 	// Read the number of objects.
-	lastID, object, err := ReadHeader(ds.r)
+	numObjects, object, err := ReadHeader(ds.r)
 	if err != nil {
 		Failf("header error: %w", err)
 	}
@@ -602,42 +603,44 @@ func (ds *decodeState) Load(obj reflect.Value) {
 	var (
 		encoded wire.Object
 		ods     *objectDecodeState
-		id      = objectID(1)
+		id      objectID
 		tid     = typeID(1)
 	)
 	if err := safely(func() {
 		// Decode all objects in the stream.
 		//
-		// Note that the structure of this decoding loop should match
-		// the raw decoding loop in printer.go.
-		for id <= objectID(lastID) {
-			// Unmarshal the object.
+		// Note that the structure of this decoding loop should match the raw
+		// decoding loop in state/pretty/pretty.printer.printStream().
+		for i := uint64(0); i < numObjects; {
+			// Unmarshal either a type object or object ID.
 			encoded = wire.Load(ds.r)
-
-			// Is this a type object? Handle inline.
-			if wt, ok := encoded.(*wire.Type); ok {
-				ds.types.Register(wt)
+			switch we := encoded.(type) {
+			case *wire.Type:
+				ds.types.Register(we)
 				tid++
 				encoded = nil
 				continue
+			case wire.Uint:
+				id = objectID(we)
+				i++
+				// Unmarshal and resolve the actual object.
+				encoded = wire.Load(ds.r)
+				ods = ds.lookup(id)
+				if ods != nil {
+					// Decode the object.
+					ds.decodeObject(ods, ods.obj, encoded)
+				} else {
+					// If an object hasn't had interest registered
+					// previously or isn't yet valid, we deferred
+					// decoding until interest is registered.
+					ds.deferred[id] = encoded
+				}
+				// For error handling.
+				ods = nil
+				encoded = nil
+			default:
+				Failf("wanted type or object ID, got %#v", encoded)
 			}
-
-			// Actually resolve the object.
-			ods = ds.lookup(id)
-			if ods != nil {
-				// Decode the object.
-				ds.decodeObject(ods, ods.obj, encoded)
-			} else {
-				// If an object hasn't had interest registered
-				// previously or isn't yet valid, we deferred
-				// decoding until interest is registered.
-				ds.deferred[id] = encoded
-			}
-
-			// For error handling.
-			ods = nil
-			encoded = nil
-			id++
 		}
 	}); err != nil {
 		// Include as much information as we can, taking into account
@@ -645,16 +648,25 @@ func (ds *decodeState) Load(obj reflect.Value) {
 		if ods != nil {
 			Failf("error decoding object ID %d (%T) from %#v: %w", id, ods.obj.Interface(), encoded, err)
 		} else if encoded != nil {
-			Failf("lookup error decoding object ID %d from %#v: %w", id, encoded, err)
+			Failf("error decoding from %#v: %w", encoded, err)
 		} else {
 			Failf("general decoding error: %w", err)
 		}
 	}
 
 	// Check if we have any deferred objects.
+	numDeferred := 0
 	for id, encoded := range ds.deferred {
-		// Shoud never happen, the graph was bogus.
-		Failf("still have deferred objects: one is ID %d, %#v", id, encoded)
+		numDeferred++
+		if s, ok := encoded.(*wire.Struct); ok && s.TypeID != 0 {
+			typ := ds.types.LookupType(typeID(s.TypeID))
+			log.Warningf("unused deferred object: ID %d, type %v", id, typ)
+		} else {
+			log.Warningf("unused deferred object: ID %d, %#v", id, encoded)
+		}
+	}
+	if numDeferred != 0 {
+		Failf("still had %d deferred objects", numDeferred)
 	}
 
 	// Scan and fire all callbacks. We iterate over the list of incomplete
diff --git a/pkg/state/decode_unsafe.go b/pkg/state/decode_unsafe.go
index d048f61a1..f1208e2a2 100644
--- a/pkg/state/decode_unsafe.go
+++ b/pkg/state/decode_unsafe.go
@@ -15,13 +15,62 @@
 package state
 
 import (
+	"fmt"
 	"reflect"
+	"runtime"
 	"unsafe"
 )
 
-// unsafePointerTo is logically equivalent to reflect.Value.Addr, but works on
-// values representing unexported fields. This bypasses visibility, but not
-// type safety.
-func unsafePointerTo(obj reflect.Value) reflect.Value {
+// reflectValueRWAddr is equivalent to obj.Addr(), except that the returned
+// reflect.Value is usable in assignments even if obj was obtained by the use
+// of unexported struct fields.
+//
+// Preconditions: obj.CanAddr().
+func reflectValueRWAddr(obj reflect.Value) reflect.Value {
 	return reflect.NewAt(obj.Type(), unsafe.Pointer(obj.UnsafeAddr()))
 }
+
+// reflectValueRWSlice3 is equivalent to arr.Slice3(i, j, k), except that the
+// returned reflect.Value is usable in assignments even if obj was obtained by
+// the use of unexported struct fields.
+//
+// Preconditions:
+// * arr.Kind() == reflect.Array.
+// * i, j, k >= 0.
+// * i <= j <= k <= arr.Len().
+func reflectValueRWSlice3(arr reflect.Value, i, j, k int) reflect.Value {
+	if arr.Kind() != reflect.Array {
+		panic(fmt.Sprintf("arr has kind %v, wanted %v", arr.Kind(), reflect.Array))
+	}
+	if i < 0 || j < 0 || k < 0 {
+		panic(fmt.Sprintf("negative subscripts (%d, %d, %d)", i, j, k))
+	}
+	if i > j {
+		panic(fmt.Sprintf("subscript i (%d) > j (%d)", i, j))
+	}
+	if j > k {
+		panic(fmt.Sprintf("subscript j (%d) > k (%d)", j, k))
+	}
+	if k > arr.Len() {
+		panic(fmt.Sprintf("subscript k (%d) > array length (%d)", k, arr.Len()))
+	}
+
+	sliceTyp := reflect.SliceOf(arr.Type().Elem())
+	if i == arr.Len() {
+		// By precondition, i == j == k == arr.Len().
+		return reflect.MakeSlice(sliceTyp, 0, 0)
+	}
+	slh := reflect.SliceHeader{
+		// reflect.Value.CanAddr() == false for arrays, so we need to get the
+		// address from the first element of the array.
+		Data: arr.Index(i).UnsafeAddr(),
+		Len:  j - i,
+		Cap:  k - i,
+	}
+	slobj := reflect.NewAt(sliceTyp, unsafe.Pointer(&slh)).Elem()
+	// Before slobj is constructed, arr holds the only pointer-typed pointer to
+	// the array since reflect.SliceHeader.Data is a uintptr, so arr must be
+	// kept alive.
+	runtime.KeepAlive(arr)
+	return slobj
+}
diff --git a/pkg/state/encode.go b/pkg/state/encode.go
index 92fcad4e9..560e7c2a3 100644
--- a/pkg/state/encode.go
+++ b/pkg/state/encode.go
@@ -17,13 +17,14 @@ package state
 import (
 	"context"
 	"reflect"
+	"sort"
 
 	"gvisor.dev/gvisor/pkg/state/wire"
 )
 
 // objectEncodeState the type and identity of an object occupying a memory
 // address range. This is the value type for addrSet, and the intrusive entry
-// for the pending and deferred lists.
+// for the deferred list.
 type objectEncodeState struct {
 	// id is the assigned ID for this object.
 	id objectID
@@ -47,7 +48,6 @@ type objectEncodeState struct {
 	// references may be updated directly and automatically.
 	refs []*wire.Ref
 
-	pendingEntry
 	deferredEntry
 }
 
@@ -93,9 +93,15 @@ type encodeState struct {
 	// serialized.
 	pendingTypes []wire.Type
 
-	// pending is the list of objects to be serialized. Serialization does
+	// pending maps object IDs to objects to be serialized. Serialization does
 	// not actually occur until the full object graph is computed.
-	pending pendingList
+	pending map[objectID]*objectEncodeState
+
+	// encodedStructs maps reflect.Values representing structs to previous
+	// encodings of those structs. This is necessary to avoid duplicate calls
+	// to SaverLoader.StateSave() that may result in multiple calls to
+	// Sink.SaveValue() for a given field, resulting in object duplication.
+	encodedStructs map[reflect.Value]*wire.Struct
 
 	// stats tracks time data.
 	stats Stats
@@ -189,7 +195,8 @@ func (es *encodeState) resolve(obj reflect.Value, ref *wire.Ref) {
 			// depending on this value knows there's nothing there.
 			return
 		}
-		if seg, _ := es.values.Find(addr); seg.Ok() {
+		seg, gap := es.values.Find(addr)
+		if seg.Ok() {
 			// Ensure the map types match.
 			existing := seg.Value()
 			if existing.obj.Type() != obj.Type() {
@@ -203,13 +210,20 @@ func (es *encodeState) resolve(obj reflect.Value, ref *wire.Ref) {
 		}
 
 		// Record the map.
+		r := addrRange{addr, addr + 1}
 		oes := &objectEncodeState{
 			id:  es.nextID(),
 			obj: obj,
 			how: encodeMapAsValue,
 		}
-		es.values.Add(addrRange{addr, addr + 1}, oes)
-		es.pending.PushBack(oes)
+		// Use Insert instead of InsertWithoutMergingUnchecked when race
+		// detection is enabled to get additional sanity-checking from Merge.
+		if !raceEnabled {
+			es.values.InsertWithoutMergingUnchecked(gap, r, oes)
+		} else {
+			es.values.Insert(gap, r, oes)
+		}
+		es.pending[oes.id] = oes
 		es.deferred.PushBack(oes)
 
 		// See above: no ref recording.
@@ -245,7 +259,7 @@ func (es *encodeState) resolve(obj reflect.Value, ref *wire.Ref) {
 					obj: obj,
 				}
 				es.zeroValues[typ] = oes
-				es.pending.PushBack(oes)
+				es.pending[oes.id] = oes
 				es.deferred.PushBack(oes)
 			}
 
@@ -258,86 +272,112 @@ func (es *encodeState) resolve(obj reflect.Value, ref *wire.Ref) {
 		size = 1 // See above.
 	}
 
-	// Calculate the container.
 	end := addr + size
 	r := addrRange{addr, end}
-	if seg, _ := es.values.Find(addr); seg.Ok() {
+	seg := es.values.LowerBoundSegment(addr)
+	var (
+		oes *objectEncodeState
+		gap addrGapIterator
+	)
+
+	// Does at least one previously-registered object overlap this one?
+	if seg.Ok() && seg.Start() < end {
 		existing := seg.Value()
-		switch {
-		case seg.Start() == addr && seg.End() == end && obj.Type() == existing.obj.Type():
-			// The object is a perfect match. Happy path. Avoid the
-			// traversal and just return directly. We don't need to
-			// encode the type information or any dots here.
+
+		if seg.Range() == r && typ == existing.obj.Type() {
+			// This exact object is already registered. Avoid the traversal and
+			// just return directly. We don't need to encode the type
+			// information or any dots here.
 			ref.Root = wire.Uint(existing.id)
 			existing.refs = append(existing.refs, ref)
 			return
+		}
 
-		case (seg.Start() < addr && seg.End() >= end) || (seg.Start() <= addr && seg.End() > end):
-			// The previously registered object is larger than
-			// this, no need to update. But we expect some
-			// traversal below.
+		if seg.Range().IsSupersetOf(r) && (seg.Range() != r || isSameSizeParent(existing.obj, typ)) {
+			// This object is contained within a previously-registered object.
+			// Perform traversal from the container to the new object.
+			ref.Root = wire.Uint(existing.id)
+			ref.Dots = traverse(existing.obj.Type(), typ, seg.Start(), addr)
+			ref.Type = es.findType(existing.obj.Type())
+			existing.refs = append(existing.refs, ref)
+			return
+		}
 
-		case seg.Start() == addr && seg.End() == end:
-			if !isSameSizeParent(obj, existing.obj.Type()) {
-				break // Needs traversal.
+		// This object contains one or more previously-registered objects.
+		// Remove them and update existing references to use the new one.
+		oes := &objectEncodeState{
+			// Reuse the root ID of the first contained element.
+			id:  existing.id,
+			obj: obj,
+		}
+		type elementEncodeState struct {
+			addr uintptr
+			typ  reflect.Type
+			refs []*wire.Ref
+		}
+		var (
+			elems []elementEncodeState
+			gap   addrGapIterator
+		)
+		for {
+			// Each contained object should be completely contained within
+			// this one.
+			if raceEnabled && !r.IsSupersetOf(seg.Range()) {
+				Failf("containing object %#v does not contain existing object %#v", obj, existing.obj)
 			}
-			fallthrough // Needs update.
-
-		case (seg.Start() > addr && seg.End() <= end) || (seg.Start() >= addr && seg.End() < end):
-			// Update the object and redo the encoding.
-			old := existing.obj
-			existing.obj = obj
+			elems = append(elems, elementEncodeState{
+				addr: seg.Start(),
+				typ:  existing.obj.Type(),
+				refs: existing.refs,
+			})
+			delete(es.pending, existing.id)
 			es.deferred.Remove(existing)
-			es.deferred.PushBack(existing)
-
-			// The previously registered object is superseded by
-			// this new object. We are guaranteed to not have any
-			// mergeable neighbours in this segment set.
-			if !raceEnabled {
-				seg.SetRangeUnchecked(r)
-			} else {
-				// Add extra paranoid. This will be statically
-				// removed at compile time unless a race build.
-				es.values.Remove(seg)
-				es.values.Add(r, existing)
-				seg = es.values.LowerBoundSegment(addr)
+			gap = es.values.Remove(seg)
+			seg = gap.NextSegment()
+			if !seg.Ok() || seg.Start() >= end {
+				break
 			}
-
-			// Compute the traversal required & update references.
-			dots := traverse(obj.Type(), old.Type(), addr, seg.Start())
-			wt := es.findType(obj.Type())
-			for _, ref := range existing.refs {
+			existing = seg.Value()
+		}
+		wt := es.findType(typ)
+		for _, elem := range elems {
+			dots := traverse(typ, elem.typ, addr, elem.addr)
+			for _, ref := range elem.refs {
+				ref.Root = wire.Uint(oes.id)
 				ref.Dots = append(ref.Dots, dots...)
 				ref.Type = wt
 			}
-		default:
-			// There is a non-sensical overlap.
-			Failf("overlapping objects: [new object] %#v [existing object] %#v", obj, existing.obj)
+			oes.refs = append(oes.refs, elem.refs...)
 		}
-
-		// Compute the new reference, record and return it.
-		ref.Root = wire.Uint(existing.id)
-		ref.Dots = traverse(existing.obj.Type(), obj.Type(), seg.Start(), addr)
-		ref.Type = es.findType(obj.Type())
-		existing.refs = append(existing.refs, ref)
+		// Finally register the new containing object.
+		if !raceEnabled {
+			es.values.InsertWithoutMergingUnchecked(gap, r, oes)
+		} else {
+			es.values.Insert(gap, r, oes)
+		}
+		es.pending[oes.id] = oes
+		es.deferred.PushBack(oes)
+		ref.Root = wire.Uint(oes.id)
+		oes.refs = append(oes.refs, ref)
 		return
 	}
 
-	// The only remaining case is a pointer value that doesn't overlap with
-	// any registered addresses. Create a new entry for it, and start
-	// tracking the first reference we just created.
-	oes := &objectEncodeState{
+	// No existing object overlaps this one. Register a new object.
+	oes = &objectEncodeState{
 		id:  es.nextID(),
 		obj: obj,
 	}
+	if seg.Ok() {
+		gap = seg.PrevGap()
+	} else {
+		gap = es.values.LastGap()
+	}
 	if !raceEnabled {
-		es.values.AddWithoutMerging(r, oes)
+		es.values.InsertWithoutMergingUnchecked(gap, r, oes)
 	} else {
-		// Merges should never happen. This is just enabled extra
-		// sanity checks because the Merge function below will panic.
-		es.values.Add(r, oes)
+		es.values.Insert(gap, r, oes)
 	}
-	es.pending.PushBack(oes)
+	es.pending[oes.id] = oes
 	es.deferred.PushBack(oes)
 	ref.Root = wire.Uint(oes.id)
 	oes.refs = append(oes.refs, ref)
@@ -439,6 +479,14 @@ func (oe *objectEncoder) save(slot int, obj reflect.Value) {
 
 // encodeStruct encodes a composite object.
 func (es *encodeState) encodeStruct(obj reflect.Value, dest *wire.Object) {
+	if s, ok := es.encodedStructs[obj]; ok {
+		*dest = s
+		return
+	}
+	s := &wire.Struct{}
+	*dest = s
+	es.encodedStructs[obj] = s
+
 	// Ensure that the obj is addressable. There are two cases when it is
 	// not. First, is when this is dispatched via SaveValue. Second, when
 	// this is a map key as a struct. Either way, we need to make a copy to
@@ -449,10 +497,6 @@ func (es *encodeState) encodeStruct(obj reflect.Value, dest *wire.Object) {
 		obj = localObj.Elem()
 	}
 
-	// Prepare the value.
-	s := &wire.Struct{}
-	*dest = s
-
 	// Look the type up in the database.
 	te, ok := es.types.Lookup(obj.Type())
 	if te == nil {
@@ -730,45 +774,43 @@ func (es *encodeState) Save(obj reflect.Value) {
 		Failf("encoding error at object %#v: %w", oes.obj.Interface(), err)
 	}
 
-	// Check that items are pending.
-	if es.pending.Front() == nil {
+	// Check that we have objects to serialize.
+	if len(es.pending) == 0 {
 		Failf("pending is empty?")
 	}
 
-	// Write the header with the number of objects. Note that there is no
-	// way that es.lastID could conflict with objectID, which would
-	// indicate that an impossibly large encoding.
-	if err := WriteHeader(es.w, uint64(es.lastID), true); err != nil {
+	// Write the header with the number of objects.
+	if err := WriteHeader(es.w, uint64(len(es.pending)), true); err != nil {
 		Failf("error writing header: %w", err)
 	}
 
 	// Serialize all pending types and pending objects. Note that we don't
 	// bother removing from this list as we walk it because that just
 	// wastes time. It will not change after this point.
-	var id objectID
 	if err := safely(func() {
 		for _, wt := range es.pendingTypes {
 			// Encode the type.
 			wire.Save(es.w, &wt)
 		}
-		for oes = es.pending.Front(); oes != nil; oes = oes.pendingEntry.Next() {
-			id++ // First object is 1.
-			if oes.id != id {
-				Failf("expected id %d, got %d", id, oes.id)
-			}
-
-			// Marshall the object.
+		// Emit objects in ID order.
+		ids := make([]objectID, 0, len(es.pending))
+		for id := range es.pending {
+			ids = append(ids, id)
+		}
+		sort.Slice(ids, func(i, j int) bool {
+			return ids[i] < ids[j]
+		})
+		for _, id := range ids {
+			// Encode the id.
+			wire.Save(es.w, wire.Uint(id))
+			// Marshal the object.
+			oes := es.pending[id]
 			wire.Save(es.w, oes.encoded)
 		}
 	}); err != nil {
 		// Include the object and the error.
 		Failf("error serializing object %#v: %w", oes.encoded, err)
 	}
-
-	// Check what we wrote.
-	if id != es.lastID {
-		Failf("expected %d objects, wrote %d", es.lastID, id)
-	}
 }
 
 // objectFlag indicates that the length is a # of objects, rather than a raw
@@ -797,11 +839,6 @@ func WriteHeader(w wire.Writer, length uint64, object bool) error {
 	})
 }
 
-// pendingMapper is for the pending list.
-type pendingMapper struct{}
-
-func (pendingMapper) linkerFor(oes *objectEncodeState) *pendingEntry { return &oes.pendingEntry }
-
 // deferredMapper is for the deferred list.
 type deferredMapper struct{}
 
diff --git a/pkg/state/pretty/pretty.go b/pkg/state/pretty/pretty.go
index cf37aaa49..c6e8bb31d 100644
--- a/pkg/state/pretty/pretty.go
+++ b/pkg/state/pretty/pretty.go
@@ -26,17 +26,23 @@ import (
 	"gvisor.dev/gvisor/pkg/state/wire"
 )
 
-func formatRef(x *wire.Ref, graph uint64, html bool) string {
+type printer struct {
+	html      bool
+	typeSpecs map[string]*wire.Type
+}
+
+func (p *printer) formatRef(x *wire.Ref, graph uint64) string {
 	baseRef := fmt.Sprintf("g%dr%d", graph, x.Root)
 	fullRef := baseRef
 	if len(x.Dots) > 0 {
 		// See wire.Ref; Type valid if Dots non-zero.
-		typ, _ := formatType(x.Type, graph, html)
+		typ, _ := p.formatType(x.Type, graph)
 		var buf strings.Builder
 		buf.WriteString("(*")
 		buf.WriteString(typ)
 		buf.WriteString(")(")
 		buf.WriteString(baseRef)
+		buf.WriteString(")")
 		for _, component := range x.Dots {
 			switch v := component.(type) {
 			case *wire.FieldName:
@@ -48,37 +54,42 @@ func formatRef(x *wire.Ref, graph uint64, html bool) string {
 				panic(fmt.Sprintf("unreachable: switch should be exhaustive, unhandled case %v", reflect.TypeOf(component)))
 			}
 		}
-		buf.WriteString(")")
 		fullRef = buf.String()
 	}
-	if html {
+	if p.html {
 		return fmt.Sprintf("<a href=\"#%s\">%s</a>", baseRef, fullRef)
 	}
 	return fullRef
 }
 
-func formatType(t wire.TypeSpec, graph uint64, html bool) (string, bool) {
+func (p *printer) formatType(t wire.TypeSpec, graph uint64) (string, bool) {
 	switch x := t.(type) {
 	case wire.TypeID:
-		base := fmt.Sprintf("g%dt%d", graph, x)
-		if html {
-			return fmt.Sprintf("<a href=\"#%s\">%s</a>", base, base), true
+		tag := fmt.Sprintf("g%dt%d", graph, x)
+		desc := tag
+		if spec, ok := p.typeSpecs[tag]; ok {
+			desc += fmt.Sprintf("=%s", spec.Name)
+		} else {
+			desc += "!missing-type-spec"
 		}
-		return fmt.Sprintf("%s", base), true
+		if p.html {
+			return fmt.Sprintf("<a href=\"#%s\">%s</a>", tag, desc), true
+		}
+		return desc, true
 	case wire.TypeSpecNil:
 		return "", false // Only nil type.
 	case *wire.TypeSpecPointer:
-		element, _ := formatType(x.Type, graph, html)
+		element, _ := p.formatType(x.Type, graph)
 		return fmt.Sprintf("(*%s)", element), true
 	case *wire.TypeSpecArray:
-		element, _ := formatType(x.Type, graph, html)
+		element, _ := p.formatType(x.Type, graph)
 		return fmt.Sprintf("[%d](%s)", x.Count, element), true
 	case *wire.TypeSpecSlice:
-		element, _ := formatType(x.Type, graph, html)
+		element, _ := p.formatType(x.Type, graph)
 		return fmt.Sprintf("([]%s)", element), true
 	case *wire.TypeSpecMap:
-		key, _ := formatType(x.Key, graph, html)
-		value, _ := formatType(x.Value, graph, html)
+		key, _ := p.formatType(x.Key, graph)
+		value, _ := p.formatType(x.Value, graph)
 		return fmt.Sprintf("(map[%s]%s)", key, value), true
 	default:
 		panic(fmt.Sprintf("unreachable: unknown type %T", t))
@@ -87,7 +98,7 @@ func formatType(t wire.TypeSpec, graph uint64, html bool) (string, bool) {
 
 // format formats a single object, for pretty-printing. It also returns whether
 // the value is a non-zero value.
-func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bool) {
+func (p *printer) format(graph uint64, depth int, encoded wire.Object) (string, bool) {
 	switch x := encoded.(type) {
 	case wire.Nil:
 		return "nil", false
@@ -98,7 +109,7 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 	case *wire.Complex128:
 		return fmt.Sprintf("%f+%fi", real(*x), imag(*x)), *x != 0.0
 	case *wire.Ref:
-		return formatRef(x, graph, html), x.Root != 0
+		return p.formatRef(x, graph), x.Root != 0
 	case *wire.Type:
 		tabs := "\n" + strings.Repeat("\t", depth)
 		items := make([]string, 0, len(x.Fields)+2)
@@ -109,7 +120,7 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 		items = append(items, "}")
 		return strings.Join(items, tabs), true // No zero value.
 	case *wire.Slice:
-		return fmt.Sprintf("%s{len:%d,cap:%d}", formatRef(&x.Ref, graph, html), x.Length, x.Capacity), x.Capacity != 0
+		return fmt.Sprintf("%s{len:%d,cap:%d}", p.formatRef(&x.Ref, graph), x.Length, x.Capacity), x.Capacity != 0
 	case *wire.Array:
 		if len(x.Contents) == 0 {
 			return "[]", false
@@ -119,7 +130,7 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 		items = append(items, "[")
 		tabs := "\n" + strings.Repeat("\t", depth)
 		for i := 0; i < len(x.Contents); i++ {
-			item, ok := format(graph, depth+1, x.Contents[i], html)
+			item, ok := p.format(graph, depth+1, x.Contents[i])
 			if !ok {
 				zeros = append(zeros, fmt.Sprintf("\t%s,", item))
 				continue
@@ -136,7 +147,9 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 		items = append(items, "]")
 		return strings.Join(items, tabs), len(zeros) < len(x.Contents)
 	case *wire.Struct:
-		typ, _ := formatType(x.TypeID, graph, html)
+		tag := fmt.Sprintf("g%dt%d", graph, x.TypeID)
+		spec, _ := p.typeSpecs[tag]
+		typ, _ := p.formatType(x.TypeID, graph)
 		if x.Fields() == 0 {
 			return fmt.Sprintf("struct[%s]{}", typ), false
 		}
@@ -145,10 +158,15 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 		tabs := "\n" + strings.Repeat("\t", depth)
 		allZero := true
 		for i := 0; i < x.Fields(); i++ {
-			element, ok := format(graph, depth+1, *x.Field(i), html)
+			var name string
+			if spec != nil && i < len(spec.Fields) {
+				name = spec.Fields[i]
+			} else {
+				name = fmt.Sprintf("%d", i)
+			}
+			element, ok := p.format(graph, depth+1, *x.Field(i))
 			allZero = allZero && !ok
-			items = append(items, fmt.Sprintf("\t%d: %s,", i, element))
-			i++
+			items = append(items, fmt.Sprintf("\t%s: %s,", name, element))
 		}
 		items = append(items, "}")
 		return strings.Join(items, tabs), !allZero
@@ -160,15 +178,15 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 		items = append(items, "map{")
 		tabs := "\n" + strings.Repeat("\t", depth)
 		for i := 0; i < len(x.Keys); i++ {
-			key, _ := format(graph, depth+1, x.Keys[i], html)
-			value, _ := format(graph, depth+1, x.Values[i], html)
+			key, _ := p.format(graph, depth+1, x.Keys[i])
+			value, _ := p.format(graph, depth+1, x.Values[i])
 			items = append(items, fmt.Sprintf("\t%s: %s,", key, value))
 		}
 		items = append(items, "}")
 		return strings.Join(items, tabs), true
 	case *wire.Interface:
-		typ, typOk := formatType(x.Type, graph, html)
-		element, elementOk := format(graph, depth+1, x.Value, html)
+		typ, typOk := p.formatType(x.Type, graph)
+		element, elementOk := p.format(graph, depth+1, x.Value)
 		return fmt.Sprintf("interface[%s]{%s}", typ, element), typOk || elementOk
 	default:
 		// Must be a primitive; use reflection.
@@ -177,11 +195,11 @@ func format(graph uint64, depth int, encoded wire.Object, html bool) (string, bo
 }
 
 // printStream is the basic print implementation.
-func printStream(w io.Writer, r wire.Reader, html bool) (err error) {
+func (p *printer) printStream(w io.Writer, r wire.Reader) (err error) {
 	// current graph ID.
 	var graph uint64
 
-	if html {
+	if p.html {
 		fmt.Fprintf(w, "<pre>")
 		defer fmt.Fprintf(w, "</pre>")
 	}
@@ -196,6 +214,8 @@ func printStream(w io.Writer, r wire.Reader, html bool) (err error) {
 		}
 	}()
 
+	p.typeSpecs = make(map[string]*wire.Type)
+
 	for {
 		// Find the first object to begin generation.
 		length, object, err := state.ReadHeader(r)
@@ -222,19 +242,23 @@ func printStream(w io.Writer, r wire.Reader, html bool) (err error) {
 		// Note that this loop must match the general structure of the
 		// loop in decode.go. But we don't register type information,
 		// etc. and just print the raw structures.
+		type objectAndID struct {
+			id  uint64
+			obj wire.Object
+		}
 		var (
-			oid uint64 = 1
-			tid uint64 = 1
+			tid     uint64 = 1
+			objects []objectAndID
 		)
-		for oid <= length {
-			// Unmarshal the object.
+		for i := uint64(0); i < length; {
+			// Unmarshal either a type object or object ID.
 			encoded := wire.Load(r)
-
-			// Is this a type?
-			if _, ok := encoded.(*wire.Type); ok {
-				str, _ := format(graph, 0, encoded, html)
+			switch we := encoded.(type) {
+			case *wire.Type:
+				str, _ := p.format(graph, 0, encoded)
 				tag := fmt.Sprintf("g%dt%d", graph, tid)
-				if html {
+				p.typeSpecs[tag] = we
+				if p.html {
 					// See below.
 					tag = fmt.Sprintf("<a name=\"%s\">%s</a><a href=\"#%s\">&#9875;</a>", tag, tag, tag)
 				}
@@ -242,20 +266,29 @@ func printStream(w io.Writer, r wire.Reader, html bool) (err error) {
 					return err
 				}
 				tid++
-				continue
+			case wire.Uint:
+				// Unmarshal the actual object.
+				objects = append(objects, objectAndID{
+					id:  uint64(we),
+					obj: wire.Load(r),
+				})
+				i++
+			default:
+				return fmt.Errorf("wanted type or object ID, got %#v", encoded)
 			}
+		}
 
+		for _, objAndID := range objects {
 			// Format the node.
-			str, _ := format(graph, 0, encoded, html)
-			tag := fmt.Sprintf("g%dr%d", graph, oid)
-			if html {
+			str, _ := p.format(graph, 0, objAndID.obj)
+			tag := fmt.Sprintf("g%dr%d", graph, objAndID.id)
+			if p.html {
 				// Create a little tag with an anchor next to it for linking.
 				tag = fmt.Sprintf("<a name=\"%s\">%s</a><a href=\"#%s\">&#9875;</a>", tag, tag, tag)
 			}
 			if _, err := fmt.Fprintf(w, "%s = %s\n", tag, str); err != nil {
 				return err
 			}
-			oid++
 		}
 	}
 
@@ -264,10 +297,10 @@ func printStream(w io.Writer, r wire.Reader, html bool) (err error) {
 
 // PrintText reads the stream from r and prints text to w.
 func PrintText(w io.Writer, r wire.Reader) error {
-	return printStream(w, r, false /* html */)
+	return (&printer{}).printStream(w, r)
 }
 
 // PrintHTML reads the stream from r and prints html to w.
 func PrintHTML(w io.Writer, r wire.Reader) error {
-	return printStream(w, r, true /* html */)
+	return (&printer{html: true}).printStream(w, r)
 }
diff --git a/pkg/state/state.go b/pkg/state/state.go
index acb629969..6b8540f03 100644
--- a/pkg/state/state.go
+++ b/pkg/state/state.go
@@ -90,10 +90,12 @@ func (e *ErrState) Unwrap() error {
 func Save(ctx context.Context, w wire.Writer, rootPtr interface{}) (Stats, error) {
 	// Create the encoding state.
 	es := encodeState{
-		ctx:        ctx,
-		w:          w,
-		types:      makeTypeEncodeDatabase(),
-		zeroValues: make(map[reflect.Type]*objectEncodeState),
+		ctx:            ctx,
+		w:              w,
+		types:          makeTypeEncodeDatabase(),
+		zeroValues:     make(map[reflect.Type]*objectEncodeState),
+		pending:        make(map[objectID]*objectEncodeState),
+		encodedStructs: make(map[reflect.Value]*wire.Struct),
 	}
 
 	// Perform the encoding.
diff --git a/pkg/state/tests/load_test.go b/pkg/state/tests/load_test.go
index 1e9794296..3c73ac391 100644
--- a/pkg/state/tests/load_test.go
+++ b/pkg/state/tests/load_test.go
@@ -20,6 +20,14 @@ import (
 
 func TestLoadHooks(t *testing.T) {
 	runTestCases(t, false, "load-hooks", []interface{}{
+		// Root object being a struct.
+		afterLoadStruct{v: 1},
+		valueLoadStruct{v: 1},
+		genericContainer{v: &afterLoadStruct{v: 1}},
+		genericContainer{v: &valueLoadStruct{v: 1}},
+		sliceContainer{v: []interface{}{&afterLoadStruct{v: 1}}},
+		sliceContainer{v: []interface{}{&valueLoadStruct{v: 1}}},
+		// Root object being a pointer.
 		&afterLoadStruct{v: 1},
 		&valueLoadStruct{v: 1},
 		&genericContainer{v: &afterLoadStruct{v: 1}},
diff --git a/pkg/state/tests/struct.go b/pkg/state/tests/struct.go
index bd2c2b399..69143d194 100644
--- a/pkg/state/tests/struct.go
+++ b/pkg/state/tests/struct.go
@@ -54,12 +54,47 @@ type outerArray struct {
 }
 
 // +stateify savable
+type outerSlice struct {
+	inner []inner
+}
+
+// +stateify savable
 type inner struct {
 	v int64
 }
 
 // +stateify savable
+type outerFieldValue struct {
+	inner innerFieldValue
+}
+
+// +stateify savable
+type innerFieldValue struct {
+	v int64 `state:".(*savedFieldValue)"`
+}
+
+// +stateify savable
+type savedFieldValue struct {
+	v int64
+}
+
+func (ifv *innerFieldValue) saveV() *savedFieldValue {
+	return &savedFieldValue{ifv.v}
+}
+
+func (ifv *innerFieldValue) loadV(sfv *savedFieldValue) {
+	ifv.v = sfv.v
+}
+
+// +stateify savable
 type system struct {
 	v1 interface{}
 	v2 interface{}
 }
+
+// +stateify savable
+type system3 struct {
+	v1 interface{}
+	v2 interface{}
+	v3 interface{}
+}
diff --git a/pkg/state/tests/struct_test.go b/pkg/state/tests/struct_test.go
index de9d17aa7..c91c2c032 100644
--- a/pkg/state/tests/struct_test.go
+++ b/pkg/state/tests/struct_test.go
@@ -15,6 +15,7 @@
 package tests
 
 import (
+	"math/rand"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/state"
@@ -67,12 +68,23 @@ func TestRegisterTypeOnlyStruct(t *testing.T) {
 }
 
 func TestEmbeddedPointers(t *testing.T) {
-	var (
-		ofs outerSame
-		of1 outerFieldFirst
-		of2 outerFieldSecond
-		oa  outerArray
-	)
+	// Give each int64 a random value to prevent Go from using
+	// runtime.staticuint64s, which confounds tests for struct duplication.
+	magic := func() int64 {
+		for {
+			n := rand.Int63()
+			if n < 0 || n > 255 {
+				return n
+			}
+		}
+	}
+
+	ofs := outerSame{inner{magic()}}
+	of1 := outerFieldFirst{inner{magic()}, magic()}
+	of2 := outerFieldSecond{magic(), inner{magic()}}
+	oa := outerArray{[2]inner{{magic()}, {magic()}}}
+	osl := outerSlice{oa.inner[:]}
+	ofv := outerFieldValue{innerFieldValue{magic()}}
 
 	runTestCases(t, false, "embedded-pointers", []interface{}{
 		system{&ofs, &ofs.inner},
@@ -85,5 +97,15 @@ func TestEmbeddedPointers(t *testing.T) {
 		system{&oa, &oa.inner[1]},
 		system{&oa.inner[0], &oa},
 		system{&oa.inner[1], &oa},
+		system3{&oa, &oa.inner[0], &oa.inner[1]},
+		system3{&oa, &oa.inner[1], &oa.inner[0]},
+		system3{&oa.inner[0], &oa, &oa.inner[1]},
+		system3{&oa.inner[1], &oa, &oa.inner[0]},
+		system3{&oa.inner[0], &oa.inner[1], &oa},
+		system3{&oa.inner[1], &oa.inner[0], &oa},
+		system{&oa, &osl},
+		system{&osl, &oa},
+		system{&ofv, &ofv.inner},
+		system{&ofv.inner, &ofv},
 	})
 }
diff --git a/pkg/state/types.go b/pkg/state/types.go
index 215ef80f8..84aed8732 100644
--- a/pkg/state/types.go
+++ b/pkg/state/types.go
@@ -107,6 +107,14 @@ func lookupNameFields(typ reflect.Type) (string, []string, bool) {
 		}
 		return name, nil, true
 	}
+	// Sanity check the type.
+	if raceEnabled {
+		if _, ok := reverseTypeDatabase[typ]; !ok {
+			// The type was not registered? Must be an embedded
+			// structure or something else.
+			return "", nil, false
+		}
+	}
 	// Extract the name from the object.
 	name := t.StateTypeName()
 	fields := t.StateFields()
@@ -313,6 +321,9 @@ var primitiveTypeDatabase = func() map[string]reflect.Type {
 // globalTypeDatabase is used for dispatching interfaces on decode.
 var globalTypeDatabase = map[string]reflect.Type{}
 
+// reverseTypeDatabase is a reverse mapping.
+var reverseTypeDatabase = map[reflect.Type]string{}
+
 // Register registers a type.
 //
 // This must be called on init and only done once.
@@ -358,4 +369,7 @@ func Register(t Type) {
 		Failf("conflicting name for %T: matches interfaceType", t)
 	}
 	globalTypeDatabase[name] = typ
+	if raceEnabled {
+		reverseTypeDatabase[typ] = name
+	}
 }
diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD
index 4d47207f7..68535c3b1 100644
--- a/pkg/sync/BUILD
+++ b/pkg/sync/BUILD
@@ -38,6 +38,7 @@ go_library(
         "race_unsafe.go",
         "rwmutex_unsafe.go",
         "seqcount.go",
+        "spin_unsafe.go",
         "sync.go",
     ],
     marshal = False,
diff --git a/pkg/sync/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go
index 1d7780695..f5e630009 100644
--- a/pkg/sync/memmove_unsafe.go
+++ b/pkg/sync/memmove_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.12
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sync/mutex_unsafe.go b/pkg/sync/mutex_unsafe.go
index dc034d561..f4c2e9642 100644
--- a/pkg/sync/mutex_unsafe.go
+++ b/pkg/sync/mutex_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.13
-// +build !go1.16
+// +build !go1.17
 
 // When updating the build constraint (above), check that syncMutex matches the
 // standard library sync.Mutex definition.
diff --git a/pkg/sync/rwmutex_unsafe.go b/pkg/sync/rwmutex_unsafe.go
index 995c0346e..b3b4dee78 100644
--- a/pkg/sync/rwmutex_unsafe.go
+++ b/pkg/sync/rwmutex_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.13
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sync/seqatomic_unsafe.go b/pkg/sync/seqatomic_unsafe.go
index eda6fb131..2184cb5ab 100644
--- a/pkg/sync/seqatomic_unsafe.go
+++ b/pkg/sync/seqatomic_unsafe.go
@@ -25,41 +25,35 @@ import (
 type Value struct{}
 
 // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
-// with any writer critical sections in sc.
-func SeqAtomicLoad(sc *sync.SeqCount, ptr *Value) Value {
-	// This function doesn't use SeqAtomicTryLoad because doing so is
-	// measurably, significantly (~20%) slower; Go is awful at inlining.
-	var val Value
+// with any writer critical sections in seq.
+//
+//go:nosplit
+func SeqAtomicLoad(seq *sync.SeqCount, ptr *Value) Value {
 	for {
-		epoch := sc.BeginRead()
-		if sync.RaceEnabled {
-			// runtime.RaceDisable() doesn't actually stop the race detector,
-			// so it can't help us here. Instead, call runtime.memmove
-			// directly, which is not instrumented by the race detector.
-			sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
-		} else {
-			// This is ~40% faster for short reads than going through memmove.
-			val = *ptr
-		}
-		if sc.ReadOk(epoch) {
-			break
+		if val, ok := SeqAtomicTryLoad(seq, seq.BeginRead(), ptr); ok {
+			return val
 		}
 	}
-	return val
 }
 
 // SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section
-// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read
-// would race with a writer critical section, SeqAtomicTryLoad returns
+// in seq initiated by a call to seq.BeginRead() that returned epoch. If the
+// read would race with a writer critical section, SeqAtomicTryLoad returns
 // (unspecified, false).
-func SeqAtomicTryLoad(sc *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (Value, bool) {
-	var val Value
+//
+//go:nosplit
+func SeqAtomicTryLoad(seq *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (val Value, ok bool) {
 	if sync.RaceEnabled {
+		// runtime.RaceDisable() doesn't actually stop the race detector, so it
+		// can't help us here. Instead, call runtime.memmove directly, which is
+		// not instrumented by the race detector.
 		sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
 	} else {
+		// This is ~40% faster for short reads than going through memmove.
 		val = *ptr
 	}
-	return val, sc.ReadOk(epoch)
+	ok = seq.ReadOk(epoch)
+	return
 }
 
 func init() {
diff --git a/pkg/sync/seqcount.go b/pkg/sync/seqcount.go
index a1e895352..2c5d3df99 100644
--- a/pkg/sync/seqcount.go
+++ b/pkg/sync/seqcount.go
@@ -8,7 +8,6 @@ package sync
 import (
 	"fmt"
 	"reflect"
-	"runtime"
 	"sync/atomic"
 )
 
@@ -43,9 +42,7 @@ type SeqCount struct {
 }
 
 // SeqCountEpoch tracks writer critical sections in a SeqCount.
-type SeqCountEpoch struct {
-	val uint32
-}
+type SeqCountEpoch uint32
 
 // We assume that:
 //
@@ -83,12 +80,25 @@ type SeqCountEpoch struct {
 // using this pattern. Most users of SeqCount will need to use the
 // SeqAtomicLoad function template in seqatomic.go.
 func (s *SeqCount) BeginRead() SeqCountEpoch {
-	epoch := atomic.LoadUint32(&s.epoch)
-	for epoch&1 != 0 {
-		runtime.Gosched()
-		epoch = atomic.LoadUint32(&s.epoch)
+	if epoch := atomic.LoadUint32(&s.epoch); epoch&1 == 0 {
+		return SeqCountEpoch(epoch)
+	}
+	return s.beginReadSlow()
+}
+
+func (s *SeqCount) beginReadSlow() SeqCountEpoch {
+	i := 0
+	for {
+		if canSpin(i) {
+			i++
+			doSpin()
+		} else {
+			goyield()
+		}
+		if epoch := atomic.LoadUint32(&s.epoch); epoch&1 == 0 {
+			return SeqCountEpoch(epoch)
+		}
 	}
-	return SeqCountEpoch{epoch}
 }
 
 // ReadOk returns true if the reader critical section initiated by a previous
@@ -99,7 +109,7 @@ func (s *SeqCount) BeginRead() SeqCountEpoch {
 // Reader critical sections do not need to be explicitly terminated; the last
 // call to ReadOk is implicitly the end of the reader critical section.
 func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool {
-	return atomic.LoadUint32(&s.epoch) == epoch.val
+	return atomic.LoadUint32(&s.epoch) == uint32(epoch)
 }
 
 // BeginWrite indicates the beginning of a writer critical section.
diff --git a/pkg/sync/spin_unsafe.go b/pkg/sync/spin_unsafe.go
new file mode 100644
index 000000000..cafb2d065
--- /dev/null
+++ b/pkg/sync/spin_unsafe.go
@@ -0,0 +1,24 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.13
+// +build !go1.17
+
+// Check go:linkname function signatures when updating Go version.
+
+package sync
+
+import (
+	_ "unsafe" // for go:linkname
+)
+
+//go:linkname canSpin sync.runtime_canSpin
+func canSpin(i int) bool
+
+//go:linkname doSpin sync.runtime_doSpin
+func doSpin()
+
+//go:linkname goyield runtime.goyield
+func goyield()
diff --git a/pkg/syncevent/broadcaster.go b/pkg/syncevent/broadcaster.go
index 4bff59e7d..dabf08895 100644
--- a/pkg/syncevent/broadcaster.go
+++ b/pkg/syncevent/broadcaster.go
@@ -111,7 +111,9 @@ func (b *Broadcaster) SubscribeEvents(r *Receiver, filter Set) SubscriptionID {
 	return id
 }
 
-// Preconditions: table must not be full. len(table) is a power of 2.
+// Preconditions:
+// * table must not be full.
+// * len(table) is a power of 2.
 func broadcasterTableInsert(table []broadcasterSlot, id SubscriptionID, r *Receiver, filter Set) {
 	entry := broadcasterSlot{
 		receiver: r,
diff --git a/pkg/syncevent/source.go b/pkg/syncevent/source.go
index ddffb171a..d3d0f34c5 100644
--- a/pkg/syncevent/source.go
+++ b/pkg/syncevent/source.go
@@ -19,9 +19,11 @@ type Source interface {
 	// SubscribeEvents causes the Source to notify the given Receiver of the
 	// given subset of events.
 	//
-	// Preconditions: r != nil. The ReceiverCallback for r must not take locks
-	// that are ordered prior to the Source; for example, it cannot call any
-	// Source methods.
+	// Preconditions:
+	// * r != nil.
+	// * The ReceiverCallback for r must not take locks that are ordered
+	//   prior to the Source; for example, it cannot call any Source
+	//   methods.
 	SubscribeEvents(r *Receiver, filter Set) SubscriptionID
 
 	// UnsubscribeEvents causes the Source to stop notifying the Receiver
diff --git a/pkg/syncevent/waiter_unsafe.go b/pkg/syncevent/waiter_unsafe.go
index ad271e1a0..518f18479 100644
--- a/pkg/syncevent/waiter_unsafe.go
+++ b/pkg/syncevent/waiter_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.11
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go
index fe9f50169..f516c8e46 100644
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@@ -33,6 +33,7 @@ var (
 	EBADFD       = error(syscall.EBADFD)
 	EBUSY        = error(syscall.EBUSY)
 	ECHILD       = error(syscall.ECHILD)
+	ECONNABORTED = error(syscall.ECONNABORTED)
 	ECONNREFUSED = error(syscall.ECONNREFUSED)
 	ECONNRESET   = error(syscall.ECONNRESET)
 	EDEADLK      = error(syscall.EDEADLK)
diff --git a/pkg/syserror/syserror_test.go b/pkg/syserror/syserror_test.go
index 29719752e..7036467c4 100644
--- a/pkg/syserror/syserror_test.go
+++ b/pkg/syserror/syserror_test.go
@@ -24,27 +24,20 @@ import (
 
 var globalError error
 
-func returnErrnoAsError() error {
-	return syscall.EINVAL
-}
-
-func returnError() error {
-	return syserror.EINVAL
-}
-
-func BenchmarkReturnErrnoAsError(b *testing.B) {
+func BenchmarkAssignErrno(b *testing.B) {
 	for i := b.N; i > 0; i-- {
-		returnErrnoAsError()
+		globalError = syscall.EINVAL
 	}
 }
 
-func BenchmarkReturnError(b *testing.B) {
+func BenchmarkAssignError(b *testing.B) {
 	for i := b.N; i > 0; i-- {
-		returnError()
+		globalError = syserror.EINVAL
 	}
 }
 
 func BenchmarkCompareErrno(b *testing.B) {
+	globalError = syscall.EAGAIN
 	j := 0
 	for i := b.N; i > 0; i-- {
 		if globalError == syscall.EINVAL {
@@ -54,6 +47,7 @@ func BenchmarkCompareErrno(b *testing.B) {
 }
 
 func BenchmarkCompareError(b *testing.B) {
+	globalError = syserror.EAGAIN
 	j := 0
 	for i := b.N; i > 0; i-- {
 		if globalError == syserror.EINVAL {
@@ -63,6 +57,7 @@ func BenchmarkCompareError(b *testing.B) {
 }
 
 func BenchmarkSwitchErrno(b *testing.B) {
+	globalError = syscall.EPERM
 	j := 0
 	for i := b.N; i > 0; i-- {
 		switch globalError {
@@ -77,6 +72,7 @@ func BenchmarkSwitchErrno(b *testing.B) {
 }
 
 func BenchmarkSwitchError(b *testing.B) {
+	globalError = syserror.EPERM
 	j := 0
 	for i := b.N; i > 0; i-- {
 		switch globalError {
diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go
index d82ed5205..4f551cd92 100644
--- a/pkg/tcpip/adapters/gonet/gonet.go
+++ b/pkg/tcpip/adapters/gonet/gonet.go
@@ -245,7 +245,7 @@ func NewTCPConn(wq *waiter.Queue, ep tcpip.Endpoint) *TCPConn {
 
 // Accept implements net.Conn.Accept.
 func (l *TCPListener) Accept() (net.Conn, error) {
-	n, wq, err := l.ep.Accept()
+	n, wq, err := l.ep.Accept(nil)
 
 	if err == tcpip.ErrWouldBlock {
 		// Create wait queue entry that notifies a channel.
@@ -254,7 +254,7 @@ func (l *TCPListener) Accept() (net.Conn, error) {
 		defer l.wq.EventUnregister(&waitEntry)
 
 		for {
-			n, wq, err = l.ep.Accept()
+			n, wq, err = l.ep.Accept(nil)
 
 			if err != tcpip.ErrWouldBlock {
 				break
@@ -541,7 +541,7 @@ func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress,
 		case <-notifyCh:
 		}
 
-		err = ep.GetSockOpt(tcpip.ErrorOption{})
+		err = ep.LastError()
 	}
 	if err != nil {
 		ep.Close()
diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go
index 3c552988a..b196324c7 100644
--- a/pkg/tcpip/adapters/gonet/gonet_test.go
+++ b/pkg/tcpip/adapters/gonet/gonet_test.go
@@ -61,8 +61,8 @@ func TestTimeouts(t *testing.T) {
 func newLoopbackStack() (*stack.Stack, *tcpip.Error) {
 	// Create the stack and add a NIC.
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol},
 	})
 
 	if err := s.CreateNIC(NICID, loopback.New()); err != nil {
@@ -97,6 +97,9 @@ type testConnection struct {
 func connect(s *stack.Stack, addr tcpip.FullAddress) (*testConnection, *tcpip.Error) {
 	wq := &waiter.Queue{}
 	ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		return nil, err
+	}
 
 	entry, ch := waiter.NewChannelEntry(nil)
 	wq.EventRegister(&entry, waiter.EventOut)
@@ -104,7 +107,7 @@ func connect(s *stack.Stack, addr tcpip.FullAddress) (*testConnection, *tcpip.Er
 	err = ep.Connect(addr)
 	if err == tcpip.ErrConnectStarted {
 		<-ch
-		err = ep.GetSockOpt(tcpip.ErrorOption{})
+		err = ep.LastError()
 	}
 	if err != nil {
 		return nil, err
@@ -145,7 +148,9 @@ func TestCloseReader(t *testing.T) {
 		defer close(done)
 		c, err := l.Accept()
 		if err != nil {
-			t.Fatalf("l.Accept() = %v", err)
+			t.Errorf("l.Accept() = %v", err)
+			// Cannot call Fatalf in goroutine. Just return from the goroutine.
+			return
 		}
 
 		// Give c.Read() a chance to block before closing the connection.
@@ -416,7 +421,9 @@ func TestDeadlineChange(t *testing.T) {
 		defer close(done)
 		c, err := l.Accept()
 		if err != nil {
-			t.Fatalf("l.Accept() = %v", err)
+			t.Errorf("l.Accept() = %v", err)
+			// Cannot call Fatalf in goroutine. Just return from the goroutine.
+			return
 		}
 
 		c.SetDeadline(time.Now().Add(time.Minute))
diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD
index 563bc78ea..c326fab54 100644
--- a/pkg/tcpip/buffer/BUILD
+++ b/pkg/tcpip/buffer/BUILD
@@ -14,6 +14,8 @@ go_library(
 go_test(
     name = "buffer_test",
     size = "small",
-    srcs = ["view_test.go"],
+    srcs = [
+        "view_test.go",
+    ],
     library = ":buffer",
 )
diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go
index ea0c5413d..8db70a700 100644
--- a/pkg/tcpip/buffer/view.go
+++ b/pkg/tcpip/buffer/view.go
@@ -84,8 +84,8 @@ type VectorisedView struct {
 	size  int
 }
 
-// NewVectorisedView creates a new vectorised view from an already-allocated slice
-// of View and sets its size.
+// NewVectorisedView creates a new vectorised view from an already-allocated
+// slice of View and sets its size.
 func NewVectorisedView(size int, views []View) VectorisedView {
 	return VectorisedView{views: views, size: size}
 }
@@ -170,8 +170,9 @@ func (vv *VectorisedView) CapLength(length int) {
 }
 
 // Clone returns a clone of this VectorisedView.
-// If the buffer argument is large enough to contain all the Views of this VectorisedView,
-// the method will avoid allocations and use the buffer to store the Views of the clone.
+// If the buffer argument is large enough to contain all the Views of this
+// VectorisedView, the method will avoid allocations and use the buffer to
+// store the Views of the clone.
 func (vv *VectorisedView) Clone(buffer []View) VectorisedView {
 	return VectorisedView{views: append(buffer[:0], vv.views...), size: vv.size}
 }
@@ -209,7 +210,8 @@ func (vv *VectorisedView) PullUp(count int) (View, bool) {
 	return newFirst, true
 }
 
-// Size returns the size in bytes of the entire content stored in the vectorised view.
+// Size returns the size in bytes of the entire content stored in the
+// vectorised view.
 func (vv *VectorisedView) Size() int {
 	return vv.size
 }
@@ -222,6 +224,12 @@ func (vv *VectorisedView) ToView() View {
 	if len(vv.views) == 1 {
 		return vv.views[0]
 	}
+	return vv.ToOwnedView()
+}
+
+// ToOwnedView returns a single view containing the content of the vectorised
+// view that vv does not own.
+func (vv *VectorisedView) ToOwnedView() View {
 	u := make([]byte, 0, vv.size)
 	for _, v := range vv.views {
 		u = append(u, v...)
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index b769094dc..cf1145043 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -118,18 +118,100 @@ func TTL(ttl uint8) NetworkChecker {
 			v = ip.HopLimit()
 		}
 		if v != ttl {
-			t.Fatalf("Bad TTL, got %v, want %v", v, ttl)
+			t.Fatalf("Bad TTL, got = %d, want = %d", v, ttl)
+		}
+	}
+}
+
+// IPFullLength creates a checker for the full IP packet length. The
+// expected size is checked against both the Total Length in the
+// header and the number of bytes received.
+func IPFullLength(packetLength uint16) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		var v uint16
+		var l uint16
+		switch ip := h[0].(type) {
+		case header.IPv4:
+			v = ip.TotalLength()
+			l = uint16(len(ip))
+		case header.IPv6:
+			v = ip.PayloadLength() + header.IPv6FixedHeaderSize
+			l = uint16(len(ip))
+		default:
+			t.Fatalf("unexpected network header passed to checker, got = %T, want = header.IPv4 or header.IPv6", ip)
+		}
+		if l != packetLength {
+			t.Errorf("bad packet length, got = %d, want = %d", l, packetLength)
+		}
+		if v != packetLength {
+			t.Errorf("unexpected packet length in header, got = %d, want = %d", v, packetLength)
+		}
+	}
+}
+
+// IPv4HeaderLength creates a checker that checks the IPv4 Header length.
+func IPv4HeaderLength(headerLength int) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		switch ip := h[0].(type) {
+		case header.IPv4:
+			if hl := ip.HeaderLength(); hl != uint8(headerLength) {
+				t.Errorf("Bad header length, got = %d, want = %d", hl, headerLength)
+			}
+		default:
+			t.Fatalf("unexpected network header passed to checker, got = %T, want = header.IPv4", ip)
 		}
 	}
 }
 
 // PayloadLen creates a checker that checks the payload length.
-func PayloadLen(plen int) NetworkChecker {
+func PayloadLen(payloadLength int) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
 		t.Helper()
 
-		if l := len(h[0].Payload()); l != plen {
-			t.Errorf("Bad payload length, got %v, want %v", l, plen)
+		if l := len(h[0].Payload()); l != payloadLength {
+			t.Errorf("Bad payload length, got = %d, want = %d", l, payloadLength)
+		}
+	}
+}
+
+// IPPayload creates a checker that checks the payload.
+func IPPayload(payload []byte) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		got := h[0].Payload()
+
+		// cmp.Diff does not consider nil slices equal to empty slices, but we do.
+		if len(got) == 0 && len(payload) == 0 {
+			return
+		}
+
+		if diff := cmp.Diff(payload, got); diff != "" {
+			t.Errorf("payload mismatch (-want +got):\n%s", diff)
+		}
+	}
+}
+
+// IPv4Options returns a checker that checks the options in an IPv4 packet.
+func IPv4Options(want []byte) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		ip, ok := h[0].(header.IPv4)
+		if !ok {
+			t.Fatalf("unexpected network header passed to checker, got = %T, want = header.IPv4", h[0])
+		}
+		options := ip.Options()
+		// cmp.Diff does not consider nil slices equal to empty slices, but we do.
+		if len(want) == 0 && len(options) == 0 {
+			return
+		}
+		if diff := cmp.Diff(want, options); diff != "" {
+			t.Errorf("options mismatch (-want +got):\n%s", diff)
 		}
 	}
 }
@@ -139,11 +221,11 @@ func FragmentOffset(offset uint16) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
 		t.Helper()
 
-		// We only do this of IPv4 for now.
+		// We only do this for IPv4 for now.
 		switch ip := h[0].(type) {
 		case header.IPv4:
 			if v := ip.FragmentOffset(); v != offset {
-				t.Errorf("Bad fragment offset, got %v, want %v", v, offset)
+				t.Errorf("Bad fragment offset, got = %d, want = %d", v, offset)
 			}
 		}
 	}
@@ -154,11 +236,11 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
 		t.Helper()
 
-		// We only do this of IPv4 for now.
+		// We only do this for IPv4 for now.
 		switch ip := h[0].(type) {
 		case header.IPv4:
 			if v := ip.Flags(); v != flags {
-				t.Errorf("Bad fragment offset, got %v, want %v", v, flags)
+				t.Errorf("Bad fragment offset, got = %d, want = %d", v, flags)
 			}
 		}
 	}
@@ -208,7 +290,7 @@ func TOS(tos uint8, label uint32) NetworkChecker {
 		t.Helper()
 
 		if v, l := h[0].TOS(); v != tos || l != label {
-			t.Errorf("Bad TOS, got (%v, %v), want (%v,%v)", v, l, tos, label)
+			t.Errorf("Bad TOS, got = (%d, %d), want = (%d,%d)", v, l, tos, label)
 		}
 	}
 }
@@ -234,7 +316,7 @@ func IPv6Fragment(checkers ...NetworkChecker) NetworkChecker {
 		t.Helper()
 
 		if p := h[0].TransportProtocol(); p != header.IPv6FragmentHeader {
-			t.Errorf("Bad protocol, got %v, want %v", p, header.UDPProtocolNumber)
+			t.Errorf("Bad protocol, got = %d, want = %d", p, header.UDPProtocolNumber)
 		}
 
 		ipv6Frag := header.IPv6Fragment(h[0].Payload())
@@ -261,7 +343,7 @@ func TCP(checkers ...TransportChecker) NetworkChecker {
 		last := h[len(h)-1]
 
 		if p := last.TransportProtocol(); p != header.TCPProtocolNumber {
-			t.Errorf("Bad protocol, got %v, want %v", p, header.TCPProtocolNumber)
+			t.Errorf("Bad protocol, got = %d, want = %d", p, header.TCPProtocolNumber)
 		}
 
 		// Verify the checksum.
@@ -297,7 +379,7 @@ func UDP(checkers ...TransportChecker) NetworkChecker {
 		last := h[len(h)-1]
 
 		if p := last.TransportProtocol(); p != header.UDPProtocolNumber {
-			t.Errorf("Bad protocol, got %v, want %v", p, header.UDPProtocolNumber)
+			t.Errorf("Bad protocol, got = %d, want = %d", p, header.UDPProtocolNumber)
 		}
 
 		udp := header.UDP(last.Payload())
@@ -316,7 +398,7 @@ func SrcPort(port uint16) TransportChecker {
 		t.Helper()
 
 		if p := h.SourcePort(); p != port {
-			t.Errorf("Bad source port, got %v, want %v", p, port)
+			t.Errorf("Bad source port, got = %d, want = %d", p, port)
 		}
 	}
 }
@@ -327,7 +409,7 @@ func DstPort(port uint16) TransportChecker {
 		t.Helper()
 
 		if p := h.DestinationPort(); p != port {
-			t.Errorf("Bad destination port, got %v, want %v", p, port)
+			t.Errorf("Bad destination port, got = %d, want = %d", p, port)
 		}
 	}
 }
@@ -339,7 +421,7 @@ func NoChecksum(noChecksum bool) TransportChecker {
 
 		udp, ok := h.(header.UDP)
 		if !ok {
-			return
+			t.Fatalf("UDP header not found in h: %T", h)
 		}
 
 		if b := udp.Checksum() == 0; b != noChecksum {
@@ -348,50 +430,84 @@ func NoChecksum(noChecksum bool) TransportChecker {
 	}
 }
 
-// SeqNum creates a checker that checks the sequence number.
-func SeqNum(seq uint32) TransportChecker {
+// TCPSeqNum creates a checker that checks the sequence number.
+func TCPSeqNum(seq uint32) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
 
 		tcp, ok := h.(header.TCP)
 		if !ok {
-			return
+			t.Fatalf("TCP header not found in h: %T", h)
 		}
 
 		if s := tcp.SequenceNumber(); s != seq {
-			t.Errorf("Bad sequence number, got %v, want %v", s, seq)
+			t.Errorf("Bad sequence number, got = %d, want = %d", s, seq)
 		}
 	}
 }
 
-// AckNum creates a checker that checks the ack number.
-func AckNum(seq uint32) TransportChecker {
+// TCPAckNum creates a checker that checks the ack number.
+func TCPAckNum(seq uint32) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
 
 		tcp, ok := h.(header.TCP)
 		if !ok {
-			return
+			t.Fatalf("TCP header not found in h: %T", h)
 		}
 
 		if s := tcp.AckNumber(); s != seq {
-			t.Errorf("Bad ack number, got %v, want %v", s, seq)
+			t.Errorf("Bad ack number, got = %d, want = %d", s, seq)
 		}
 	}
 }
 
-// Window creates a checker that checks the tcp window.
-func Window(window uint16) TransportChecker {
+// TCPWindow creates a checker that checks the tcp window.
+func TCPWindow(window uint16) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
 
 		tcp, ok := h.(header.TCP)
 		if !ok {
-			return
+			t.Fatalf("TCP header not found in hdr : %T", h)
 		}
 
 		if w := tcp.WindowSize(); w != window {
-			t.Errorf("Bad window, got 0x%x, want 0x%x", w, window)
+			t.Errorf("Bad window, got %d, want %d", w, window)
+		}
+	}
+}
+
+// TCPWindowGreaterThanEq creates a checker that checks that the TCP window
+// is greater than or equal to the provided value.
+func TCPWindowGreaterThanEq(window uint16) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		tcp, ok := h.(header.TCP)
+		if !ok {
+			t.Fatalf("TCP header not found in h: %T", h)
+		}
+
+		if w := tcp.WindowSize(); w < window {
+			t.Errorf("Bad window, got %d, want > %d", w, window)
+		}
+	}
+}
+
+// TCPWindowLessThanEq creates a checker that checks that the tcp window
+// is less than or equal to the provided value.
+func TCPWindowLessThanEq(window uint16) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		tcp, ok := h.(header.TCP)
+		if !ok {
+			t.Fatalf("TCP header not found in h: %T", h)
+		}
+
+		if w := tcp.WindowSize(); w > window {
+			t.Errorf("Bad window, got %d, want < %d", w, window)
 		}
 	}
 }
@@ -403,7 +519,7 @@ func TCPFlags(flags uint8) TransportChecker {
 
 		tcp, ok := h.(header.TCP)
 		if !ok {
-			return
+			t.Fatalf("TCP header not found in h: %T", h)
 		}
 
 		if f := tcp.Flags(); f != flags {
@@ -420,7 +536,7 @@ func TCPFlagsMatch(flags, mask uint8) TransportChecker {
 
 		tcp, ok := h.(header.TCP)
 		if !ok {
-			return
+			t.Fatalf("TCP header not found in h: %T", h)
 		}
 
 		if f := tcp.Flags(); (f & mask) != (flags & mask) {
@@ -458,7 +574,7 @@ func TCPSynOptions(wantOpts header.TCPSynOptions) TransportChecker {
 			case header.TCPOptionMSS:
 				v := uint16(opts[i+2])<<8 | uint16(opts[i+3])
 				if wantOpts.MSS != v {
-					t.Errorf("Bad MSS: got %v, want %v", v, wantOpts.MSS)
+					t.Errorf("Bad MSS, got = %d, want = %d", v, wantOpts.MSS)
 				}
 				foundMSS = true
 				i += 4
@@ -468,7 +584,7 @@ func TCPSynOptions(wantOpts header.TCPSynOptions) TransportChecker {
 				}
 				v := int(opts[i+2])
 				if v != wantOpts.WS {
-					t.Errorf("Bad WS: got %v, want %v", v, wantOpts.WS)
+					t.Errorf("Bad WS, got = %d, want = %d", v, wantOpts.WS)
 				}
 				foundWS = true
 				i += 3
@@ -517,7 +633,7 @@ func TCPSynOptions(wantOpts header.TCPSynOptions) TransportChecker {
 			t.Error("TS option specified but the timestamp value is zero")
 		}
 		if foundTS && tsEcr == 0 && wantOpts.TSEcr != 0 {
-			t.Errorf("TS option specified but TSEcr is incorrect: got %d, want: %d", tsEcr, wantOpts.TSEcr)
+			t.Errorf("TS option specified but TSEcr is incorrect, got = %d, want = %d", tsEcr, wantOpts.TSEcr)
 		}
 		if wantOpts.SACKPermitted && !foundSACKPermitted {
 			t.Errorf("SACKPermitted option not found. Options: %x", opts)
@@ -555,7 +671,7 @@ func TCPTimestampChecker(wantTS bool, wantTSVal uint32, wantTSEcr uint32) Transp
 					t.Errorf("TS option found, but option is truncated, option length: %d, want 10 bytes", limit-i)
 				}
 				if opts[i+1] != 10 {
-					t.Errorf("TS option found, but bad length specified: %d, want: 10", opts[i+1])
+					t.Errorf("TS option found, but bad length specified: got = %d, want = 10", opts[i+1])
 				}
 				tsVal = binary.BigEndian.Uint32(opts[i+2:])
 				tsEcr = binary.BigEndian.Uint32(opts[i+6:])
@@ -575,19 +691,19 @@ func TCPTimestampChecker(wantTS bool, wantTSVal uint32, wantTSEcr uint32) Transp
 		}
 
 		if wantTS != foundTS {
-			t.Errorf("TS Option mismatch: got TS= %v, want TS= %v", foundTS, wantTS)
+			t.Errorf("TS Option mismatch, got TS= %t, want TS= %t", foundTS, wantTS)
 		}
 		if wantTS && wantTSVal != 0 && wantTSVal != tsVal {
-			t.Errorf("Timestamp value is incorrect: got: %d, want: %d", tsVal, wantTSVal)
+			t.Errorf("Timestamp value is incorrect, got = %d, want = %d", tsVal, wantTSVal)
 		}
 		if wantTS && wantTSEcr != 0 && tsEcr != wantTSEcr {
-			t.Errorf("Timestamp Echo Reply is incorrect: got: %d, want: %d", tsEcr, wantTSEcr)
+			t.Errorf("Timestamp Echo Reply is incorrect, got = %d, want = %d", tsEcr, wantTSEcr)
 		}
 	}
 }
 
-// TCPNoSACKBlockChecker creates a checker that verifies that the segment does not
-// contain any SACK blocks in the TCP options.
+// TCPNoSACKBlockChecker creates a checker that verifies that the segment does
+// not contain any SACK blocks in the TCP options.
 func TCPNoSACKBlockChecker() TransportChecker {
 	return TCPSACKBlockChecker(nil)
 }
@@ -645,7 +761,7 @@ func TCPSACKBlockChecker(sackBlocks []header.SACKBlock) TransportChecker {
 		}
 
 		if !reflect.DeepEqual(gotSACKBlocks, sackBlocks) {
-			t.Errorf("SACKBlocks are not equal, got: %v, want: %v", gotSACKBlocks, sackBlocks)
+			t.Errorf("SACKBlocks are not equal, got = %v, want = %v", gotSACKBlocks, sackBlocks)
 		}
 	}
 }
@@ -661,8 +777,8 @@ func Payload(want []byte) TransportChecker {
 	}
 }
 
-// ICMPv4 creates a checker that checks that the transport protocol is ICMPv4 and
-// potentially additional ICMPv4 header fields.
+// ICMPv4 creates a checker that checks that the transport protocol is ICMPv4
+// and potentially additional ICMPv4 header fields.
 func ICMPv4(checkers ...TransportChecker) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
 		t.Helper()
@@ -690,10 +806,10 @@ func ICMPv4Type(want header.ICMPv4Type) TransportChecker {
 
 		icmpv4, ok := h.(header.ICMPv4)
 		if !ok {
-			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv4", h)
+			t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv4", h)
 		}
 		if got := icmpv4.Type(); got != want {
-			t.Fatalf("unexpected icmp type got: %d, want: %d", got, want)
+			t.Fatalf("unexpected icmp type, got = %d, want = %d", got, want)
 		}
 	}
 }
@@ -705,10 +821,76 @@ func ICMPv4Code(want header.ICMPv4Code) TransportChecker {
 
 		icmpv4, ok := h.(header.ICMPv4)
 		if !ok {
-			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv4", h)
+			t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv4", h)
 		}
 		if got := icmpv4.Code(); got != want {
-			t.Fatalf("unexpected ICMP code got: %d, want: %d", got, want)
+			t.Fatalf("unexpected ICMP code, got = %d, want = %d", got, want)
+		}
+	}
+}
+
+// ICMPv4Ident creates a checker that checks the ICMPv4 echo Ident.
+func ICMPv4Ident(want uint16) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmpv4, ok := h.(header.ICMPv4)
+		if !ok {
+			t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv4", h)
+		}
+		if got := icmpv4.Ident(); got != want {
+			t.Fatalf("unexpected ICMP ident, got = %d, want = %d", got, want)
+		}
+	}
+}
+
+// ICMPv4Seq creates a checker that checks the ICMPv4 echo Sequence.
+func ICMPv4Seq(want uint16) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmpv4, ok := h.(header.ICMPv4)
+		if !ok {
+			t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv4", h)
+		}
+		if got := icmpv4.Sequence(); got != want {
+			t.Fatalf("unexpected ICMP sequence, got = %d, want = %d", got, want)
+		}
+	}
+}
+
+// ICMPv4Checksum creates a checker that checks the ICMPv4 Checksum.
+// This assumes that the payload exactly makes up the rest of the slice.
+func ICMPv4Checksum() TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmpv4, ok := h.(header.ICMPv4)
+		if !ok {
+			t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv4", h)
+		}
+		heldChecksum := icmpv4.Checksum()
+		icmpv4.SetChecksum(0)
+		newChecksum := ^header.Checksum(icmpv4, 0)
+		icmpv4.SetChecksum(heldChecksum)
+		if heldChecksum != newChecksum {
+			t.Errorf("unexpected ICMP checksum, got = %d, want = %d", heldChecksum, newChecksum)
+		}
+	}
+}
+
+// ICMPv4Payload creates a checker that checks the payload in an ICMPv4 packet.
+func ICMPv4Payload(want []byte) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmpv4, ok := h.(header.ICMPv4)
+		if !ok {
+			t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv4", h)
+		}
+		payload := icmpv4.Payload()
+		if diff := cmp.Diff(want, payload); diff != "" {
+			t.Errorf("ICMP payload mismatch (-want +got):\n%s", diff)
 		}
 	}
 }
@@ -748,10 +930,10 @@ func ICMPv6Type(want header.ICMPv6Type) TransportChecker {
 
 		icmpv6, ok := h.(header.ICMPv6)
 		if !ok {
-			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv6", h)
+			t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv6", h)
 		}
 		if got := icmpv6.Type(); got != want {
-			t.Fatalf("unexpected icmp type got: %d, want: %d", got, want)
+			t.Fatalf("unexpected icmp type, got = %d, want = %d", got, want)
 		}
 	}
 }
@@ -763,10 +945,42 @@ func ICMPv6Code(want header.ICMPv6Code) TransportChecker {
 
 		icmpv6, ok := h.(header.ICMPv6)
 		if !ok {
-			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv6", h)
+			t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv6", h)
 		}
 		if got := icmpv6.Code(); got != want {
-			t.Fatalf("unexpected ICMP code got: %d, want: %d", got, want)
+			t.Fatalf("unexpected ICMP code, got = %d, want = %d", got, want)
+		}
+	}
+}
+
+// ICMPv6TypeSpecific creates a checker that checks the ICMPv6 TypeSpecific
+// field.
+func ICMPv6TypeSpecific(want uint32) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmpv6, ok := h.(header.ICMPv6)
+		if !ok {
+			t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv6", h)
+		}
+		if got := icmpv6.TypeSpecific(); got != want {
+			t.Fatalf("unexpected ICMP TypeSpecific, got = %d, want = %d", got, want)
+		}
+	}
+}
+
+// ICMPv6Payload creates a checker that checks the payload in an ICMPv6 packet.
+func ICMPv6Payload(want []byte) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmpv6, ok := h.(header.ICMPv6)
+		if !ok {
+			t.Fatalf("unexpected transport header passed to checker, got = %T, want = header.ICMPv6", h)
+		}
+		payload := icmpv6.Payload()
+		if diff := cmp.Diff(want, payload); diff != "" {
+			t.Errorf("ICMP payload mismatch (-want +got):\n%s", diff)
 		}
 	}
 }
diff --git a/pkg/tcpip/faketime/BUILD b/pkg/tcpip/faketime/BUILD
new file mode 100644
index 000000000..114d43df3
--- /dev/null
+++ b/pkg/tcpip/faketime/BUILD
@@ -0,0 +1,24 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "faketime",
+    srcs = ["faketime.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/tcpip",
+        "@com_github_dpjacques_clockwork//:go_default_library",
+    ],
+)
+
+go_test(
+    name = "faketime_test",
+    size = "small",
+    srcs = [
+        "faketime_test.go",
+    ],
+    deps = [
+        "//pkg/tcpip/faketime",
+    ],
+)
diff --git a/pkg/tcpip/stack/fake_time_test.go b/pkg/tcpip/faketime/faketime.go
index 92c8cb534..f7a4fbde1 100644
--- a/pkg/tcpip/stack/fake_time_test.go
+++ b/pkg/tcpip/faketime/faketime.go
@@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package stack
+// Package faketime provides a fake clock that implements tcpip.Clock interface.
+package faketime
 
 import (
 	"container/heap"
@@ -23,7 +24,29 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
-type fakeClock struct {
+// NullClock implements a clock that never advances.
+type NullClock struct{}
+
+var _ tcpip.Clock = (*NullClock)(nil)
+
+// NowNanoseconds implements tcpip.Clock.NowNanoseconds.
+func (*NullClock) NowNanoseconds() int64 {
+	return 0
+}
+
+// NowMonotonic implements tcpip.Clock.NowMonotonic.
+func (*NullClock) NowMonotonic() int64 {
+	return 0
+}
+
+// AfterFunc implements tcpip.Clock.AfterFunc.
+func (*NullClock) AfterFunc(time.Duration, func()) tcpip.Timer {
+	return nil
+}
+
+// ManualClock implements tcpip.Clock and only advances manually with Advance
+// method.
+type ManualClock struct {
 	clock clockwork.FakeClock
 
 	// mu protects the fields below.
@@ -39,34 +62,35 @@ type fakeClock struct {
 	waitGroups map[time.Time]*sync.WaitGroup
 }
 
-func newFakeClock() *fakeClock {
-	return &fakeClock{
+// NewManualClock creates a new ManualClock instance.
+func NewManualClock() *ManualClock {
+	return &ManualClock{
 		clock:      clockwork.NewFakeClock(),
 		times:      &timeHeap{},
 		waitGroups: make(map[time.Time]*sync.WaitGroup),
 	}
 }
 
-var _ tcpip.Clock = (*fakeClock)(nil)
+var _ tcpip.Clock = (*ManualClock)(nil)
 
 // NowNanoseconds implements tcpip.Clock.NowNanoseconds.
-func (fc *fakeClock) NowNanoseconds() int64 {
-	return fc.clock.Now().UnixNano()
+func (mc *ManualClock) NowNanoseconds() int64 {
+	return mc.clock.Now().UnixNano()
 }
 
 // NowMonotonic implements tcpip.Clock.NowMonotonic.
-func (fc *fakeClock) NowMonotonic() int64 {
-	return fc.NowNanoseconds()
+func (mc *ManualClock) NowMonotonic() int64 {
+	return mc.NowNanoseconds()
 }
 
 // AfterFunc implements tcpip.Clock.AfterFunc.
-func (fc *fakeClock) AfterFunc(d time.Duration, f func()) tcpip.Timer {
-	until := fc.clock.Now().Add(d)
-	wg := fc.addWait(until)
-	return &fakeTimer{
-		clock: fc,
+func (mc *ManualClock) AfterFunc(d time.Duration, f func()) tcpip.Timer {
+	until := mc.clock.Now().Add(d)
+	wg := mc.addWait(until)
+	return &manualTimer{
+		clock: mc,
 		until: until,
-		timer: fc.clock.AfterFunc(d, func() {
+		timer: mc.clock.AfterFunc(d, func() {
 			defer wg.Done()
 			f()
 		}),
@@ -75,110 +99,113 @@ func (fc *fakeClock) AfterFunc(d time.Duration, f func()) tcpip.Timer {
 
 // addWait adds an additional wait to the WaitGroup for parallel execution of
 // all work scheduled for t. Returns a reference to the WaitGroup modified.
-func (fc *fakeClock) addWait(t time.Time) *sync.WaitGroup {
-	fc.mu.RLock()
-	wg, ok := fc.waitGroups[t]
-	fc.mu.RUnlock()
+func (mc *ManualClock) addWait(t time.Time) *sync.WaitGroup {
+	mc.mu.RLock()
+	wg, ok := mc.waitGroups[t]
+	mc.mu.RUnlock()
 
 	if ok {
 		wg.Add(1)
 		return wg
 	}
 
-	fc.mu.Lock()
-	heap.Push(fc.times, t)
-	fc.mu.Unlock()
+	mc.mu.Lock()
+	heap.Push(mc.times, t)
+	mc.mu.Unlock()
 
 	wg = &sync.WaitGroup{}
 	wg.Add(1)
 
-	fc.mu.Lock()
-	fc.waitGroups[t] = wg
-	fc.mu.Unlock()
+	mc.mu.Lock()
+	mc.waitGroups[t] = wg
+	mc.mu.Unlock()
 
 	return wg
 }
 
 // removeWait removes a wait from the WaitGroup for parallel execution of all
 // work scheduled for t.
-func (fc *fakeClock) removeWait(t time.Time) {
-	fc.mu.RLock()
-	defer fc.mu.RUnlock()
+func (mc *ManualClock) removeWait(t time.Time) {
+	mc.mu.RLock()
+	defer mc.mu.RUnlock()
 
-	wg := fc.waitGroups[t]
+	wg := mc.waitGroups[t]
 	wg.Done()
 }
 
-// advance executes all work that have been scheduled to execute within d from
-// the current fake time. Blocks until all work has completed execution.
-func (fc *fakeClock) advance(d time.Duration) {
+// Advance executes all work that have been scheduled to execute within d from
+// the current  time. Blocks until all work has completed execution.
+func (mc *ManualClock) Advance(d time.Duration) {
 	// Block until all the work is done
-	until := fc.clock.Now().Add(d)
+	until := mc.clock.Now().Add(d)
 	for {
-		fc.mu.Lock()
-		if fc.times.Len() == 0 {
-			fc.mu.Unlock()
-			return
+		mc.mu.Lock()
+		if mc.times.Len() == 0 {
+			mc.mu.Unlock()
+			break
 		}
 
-		t := heap.Pop(fc.times).(time.Time)
+		t := heap.Pop(mc.times).(time.Time)
 		if t.After(until) {
 			// No work to do
-			heap.Push(fc.times, t)
-			fc.mu.Unlock()
-			return
+			heap.Push(mc.times, t)
+			mc.mu.Unlock()
+			break
 		}
-		fc.mu.Unlock()
+		mc.mu.Unlock()
 
-		diff := t.Sub(fc.clock.Now())
-		fc.clock.Advance(diff)
+		diff := t.Sub(mc.clock.Now())
+		mc.clock.Advance(diff)
 
-		fc.mu.RLock()
-		wg := fc.waitGroups[t]
-		fc.mu.RUnlock()
+		mc.mu.RLock()
+		wg := mc.waitGroups[t]
+		mc.mu.RUnlock()
 
 		wg.Wait()
 
-		fc.mu.Lock()
-		delete(fc.waitGroups, t)
-		fc.mu.Unlock()
+		mc.mu.Lock()
+		delete(mc.waitGroups, t)
+		mc.mu.Unlock()
+	}
+	if now := mc.clock.Now(); until.After(now) {
+		mc.clock.Advance(until.Sub(now))
 	}
 }
 
-type fakeTimer struct {
-	clock *fakeClock
+type manualTimer struct {
+	clock *ManualClock
 	timer clockwork.Timer
 
 	mu    sync.RWMutex
 	until time.Time
 }
 
-var _ tcpip.Timer = (*fakeTimer)(nil)
+var _ tcpip.Timer = (*manualTimer)(nil)
 
 // Reset implements tcpip.Timer.Reset.
-func (ft *fakeTimer) Reset(d time.Duration) {
-	if !ft.timer.Reset(d) {
+func (t *manualTimer) Reset(d time.Duration) {
+	if !t.timer.Reset(d) {
 		return
 	}
 
-	ft.mu.Lock()
-	defer ft.mu.Unlock()
+	t.mu.Lock()
+	defer t.mu.Unlock()
 
-	ft.clock.removeWait(ft.until)
-	ft.until = ft.clock.clock.Now().Add(d)
-	ft.clock.addWait(ft.until)
+	t.clock.removeWait(t.until)
+	t.until = t.clock.clock.Now().Add(d)
+	t.clock.addWait(t.until)
 }
 
 // Stop implements tcpip.Timer.Stop.
-func (ft *fakeTimer) Stop() bool {
-	if !ft.timer.Stop() {
+func (t *manualTimer) Stop() bool {
+	if !t.timer.Stop() {
 		return false
 	}
 
-	ft.mu.RLock()
-	defer ft.mu.RUnlock()
+	t.mu.RLock()
+	defer t.mu.RUnlock()
 
-	ft.clock.removeWait(ft.until)
+	t.clock.removeWait(t.until)
 	return true
 }
 
diff --git a/pkg/tcpip/faketime/faketime_test.go b/pkg/tcpip/faketime/faketime_test.go
new file mode 100644
index 000000000..c2704df2c
--- /dev/null
+++ b/pkg/tcpip/faketime/faketime_test.go
@@ -0,0 +1,95 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package faketime_test
+
+import (
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip/faketime"
+)
+
+func TestManualClockAdvance(t *testing.T) {
+	const timeout = time.Millisecond
+	clock := faketime.NewManualClock()
+	start := clock.NowMonotonic()
+	clock.Advance(timeout)
+	if got, want := time.Duration(clock.NowMonotonic()-start)*time.Nanosecond, timeout; got != want {
+		t.Errorf("got = %d, want = %d", got, want)
+	}
+}
+
+func TestManualClockAfterFunc(t *testing.T) {
+	const (
+		timeout1 = time.Millisecond     // timeout for counter1
+		timeout2 = 2 * time.Millisecond // timeout for counter2
+	)
+	tests := []struct {
+		name         string
+		advance      time.Duration
+		wantCounter1 int
+		wantCounter2 int
+	}{
+		{
+			name:         "before timeout1",
+			advance:      timeout1 - 1,
+			wantCounter1: 0,
+			wantCounter2: 0,
+		},
+		{
+			name:         "timeout1",
+			advance:      timeout1,
+			wantCounter1: 1,
+			wantCounter2: 0,
+		},
+		{
+			name:         "timeout2",
+			advance:      timeout2,
+			wantCounter1: 1,
+			wantCounter2: 1,
+		},
+		{
+			name:         "after timeout2",
+			advance:      timeout2 + 1,
+			wantCounter1: 1,
+			wantCounter2: 1,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			clock := faketime.NewManualClock()
+			counter1 := 0
+			counter2 := 0
+			clock.AfterFunc(timeout1, func() {
+				counter1++
+			})
+			clock.AfterFunc(timeout2, func() {
+				counter2++
+			})
+			start := clock.NowMonotonic()
+			clock.Advance(test.advance)
+			if got, want := counter1, test.wantCounter1; got != want {
+				t.Errorf("got counter1 = %d, want = %d", got, want)
+			}
+			if got, want := counter2, test.wantCounter2; got != want {
+				t.Errorf("got counter2 = %d, want = %d", got, want)
+			}
+			if got, want := time.Duration(clock.NowMonotonic()-start)*time.Nanosecond, test.advance; got != want {
+				t.Errorf("got elapsed = %d, want = %d", got, want)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go
index eaface8cb..95ade0e5c 100644
--- a/pkg/tcpip/header/eth.go
+++ b/pkg/tcpip/header/eth.go
@@ -117,25 +117,31 @@ func (b Ethernet) Encode(e *EthernetFields) {
 	copy(b[dstMAC:][:EthernetAddressSize], e.DstAddr)
 }
 
-// IsValidUnicastEthernetAddress returns true if addr is a valid unicast
+// IsMulticastEthernetAddress returns true if the address is a multicast
+// ethernet address.
+func IsMulticastEthernetAddress(addr tcpip.LinkAddress) bool {
+	if len(addr) != EthernetAddressSize {
+		return false
+	}
+
+	return addr[unicastMulticastFlagByteIdx]&unicastMulticastFlagMask != 0
+}
+
+// IsValidUnicastEthernetAddress returns true if the address is a unicast
 // ethernet address.
 func IsValidUnicastEthernetAddress(addr tcpip.LinkAddress) bool {
-	// Must be of the right length.
 	if len(addr) != EthernetAddressSize {
 		return false
 	}
 
-	// Must not be unspecified.
 	if addr == unspecifiedEthernetAddress {
 		return false
 	}
 
-	// Must not be a multicast.
 	if addr[unicastMulticastFlagByteIdx]&unicastMulticastFlagMask != 0 {
 		return false
 	}
 
-	// addr is a valid unicast ethernet address.
 	return true
 }
 
diff --git a/pkg/tcpip/header/eth_test.go b/pkg/tcpip/header/eth_test.go
index 14413f2ce..3bc8b2b21 100644
--- a/pkg/tcpip/header/eth_test.go
+++ b/pkg/tcpip/header/eth_test.go
@@ -67,6 +67,53 @@ func TestIsValidUnicastEthernetAddress(t *testing.T) {
 	}
 }
 
+func TestIsMulticastEthernetAddress(t *testing.T) {
+	tests := []struct {
+		name     string
+		addr     tcpip.LinkAddress
+		expected bool
+	}{
+		{
+			"Nil",
+			tcpip.LinkAddress([]byte(nil)),
+			false,
+		},
+		{
+			"Empty",
+			tcpip.LinkAddress(""),
+			false,
+		},
+		{
+			"InvalidLength",
+			tcpip.LinkAddress("\x01\x02\x03"),
+			false,
+		},
+		{
+			"Unspecified",
+			unspecifiedEthernetAddress,
+			false,
+		},
+		{
+			"Multicast",
+			tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+			true,
+		},
+		{
+			"Unicast",
+			tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06"),
+			false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if got := IsMulticastEthernetAddress(test.addr); got != test.expected {
+				t.Fatalf("got IsMulticastEthernetAddress = %t, want = %t", got, test.expected)
+			}
+		})
+	}
+}
+
 func TestEthernetAddressFromMulticastIPv4Address(t *testing.T) {
 	tests := []struct {
 		name             string
diff --git a/pkg/tcpip/header/icmpv4.go b/pkg/tcpip/header/icmpv4.go
index be03fb086..0f5fb3964 100644
--- a/pkg/tcpip/header/icmpv4.go
+++ b/pkg/tcpip/header/icmpv4.go
@@ -31,6 +31,27 @@ const (
 	// ICMPv4MinimumSize is the minimum size of a valid ICMP packet.
 	ICMPv4MinimumSize = 8
 
+	// ICMPv4MinimumErrorPayloadSize Is the smallest number of bytes of an
+	// errant packet's transport layer that an ICMP error type packet should
+	// attempt to send as per RFC 792 (see each type) and RFC 1122
+	// section 3.2.2 which states:
+	//      Every ICMP error message includes the Internet header and at
+	//      least the first 8 data octets of the datagram that triggered
+	//      the error; more than 8 octets MAY be sent; this header and data
+	//      MUST be unchanged from the received datagram.
+	//
+	// RFC 792 shows:
+	//   0                   1                   2                   3
+	//  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+	// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	// |     Type      |     Code      |          Checksum             |
+	// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	// |                             unused                            |
+	// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	// |      Internet Header + 64 bits of Original Data Datagram      |
+	// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	ICMPv4MinimumErrorPayloadSize = 8
+
 	// ICMPv4ProtocolNumber is the ICMP transport protocol number.
 	ICMPv4ProtocolNumber tcpip.TransportProtocolNumber = 1
 
@@ -39,15 +60,19 @@ const (
 	icmpv4ChecksumOffset = 2
 
 	// icmpv4MTUOffset is the offset of the MTU field
-	// in a ICMPv4FragmentationNeeded message.
+	// in an ICMPv4FragmentationNeeded message.
 	icmpv4MTUOffset = 6
 
 	// icmpv4IdentOffset is the offset of the ident field
-	// in a ICMPv4EchoRequest/Reply message.
+	// in an ICMPv4EchoRequest/Reply message.
 	icmpv4IdentOffset = 4
 
+	// icmpv4PointerOffset is the offset of the pointer field
+	// in an ICMPv4ParamProblem message.
+	icmpv4PointerOffset = 4
+
 	// icmpv4SequenceOffset is the offset of the sequence field
-	// in a ICMPv4EchoRequest/Reply message.
+	// in an ICMPv4EchoRequest/Reply message.
 	icmpv4SequenceOffset = 6
 )
 
@@ -72,15 +97,24 @@ const (
 	ICMPv4InfoReply      ICMPv4Type = 16
 )
 
+// ICMP codes for ICMPv4 Time Exceeded messages as defined in RFC 792.
+const (
+	ICMPv4TTLExceeded       ICMPv4Code = 0
+	ICMPv4ReassemblyTimeout ICMPv4Code = 1
+)
+
 // ICMP codes for ICMPv4 Destination Unreachable messages as defined in RFC 792.
 const (
-	ICMPv4TTLExceeded         ICMPv4Code = 0
+	ICMPv4NetUnreachable      ICMPv4Code = 0
 	ICMPv4HostUnreachable     ICMPv4Code = 1
 	ICMPv4ProtoUnreachable    ICMPv4Code = 2
 	ICMPv4PortUnreachable     ICMPv4Code = 3
 	ICMPv4FragmentationNeeded ICMPv4Code = 4
 )
 
+// ICMPv4UnusedCode is a code to use in ICMP messages where no code is needed.
+const ICMPv4UnusedCode ICMPv4Code = 0
+
 // Type is the ICMP type field.
 func (b ICMPv4) Type() ICMPv4Type { return ICMPv4Type(b[0]) }
 
diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go
index 20b01d8f4..4303fc5d5 100644
--- a/pkg/tcpip/header/icmpv6.go
+++ b/pkg/tcpip/header/icmpv6.go
@@ -49,14 +49,17 @@ const (
 	// neighbor advertisement packet.
 	ICMPv6NeighborAdvertMinimumSize = ICMPv6HeaderSize + NDPNAMinimumSize
 
-	// ICMPv6NeighborAdvertSize is size of a neighbor advertisement
-	// including the NDP Target Link Layer option for an Ethernet
-	// address.
-	ICMPv6NeighborAdvertSize = ICMPv6HeaderSize + NDPNAMinimumSize + NDPLinkLayerAddressSize
-
-	// ICMPv6EchoMinimumSize is the minimum size of a valid ICMP echo packet.
+	// ICMPv6EchoMinimumSize is the minimum size of a valid echo packet.
 	ICMPv6EchoMinimumSize = 8
 
+	// ICMPv6ErrorHeaderSize is the size of an ICMP error packet header,
+	// as per RFC 4443, Apendix A, item 4 and the errata.
+	//   ... all ICMP error messages shall have exactly
+	//   32 bits of type-specific data, so that receivers can reliably find
+	//   the embedded invoking packet even when they don't recognize the
+	//   ICMP message Type.
+	ICMPv6ErrorHeaderSize = 8
+
 	// ICMPv6DstUnreachableMinimumSize is the minimum size of a valid ICMP
 	// destination unreachable packet.
 	ICMPv6DstUnreachableMinimumSize = ICMPv6MinimumSize
@@ -69,6 +72,10 @@ const (
 	// in an ICMPv6 message.
 	icmpv6ChecksumOffset = 2
 
+	// icmpv6PointerOffset is the offset of the pointer
+	// in an ICMPv6 Parameter problem message.
+	icmpv6PointerOffset = 4
+
 	// icmpv6MTUOffset is the offset of the MTU field in an ICMPv6
 	// PacketTooBig message.
 	icmpv6MTUOffset = 4
@@ -89,9 +96,10 @@ const (
 	NDPHopLimit = 255
 )
 
-// ICMPv6Type is the ICMP type field described in RFC 4443 and friends.
+// ICMPv6Type is the ICMP type field described in RFC 4443.
 type ICMPv6Type byte
 
+// Values for use in the Type field of ICMPv6 packet from RFC 4433.
 const (
 	ICMPv6DstUnreachable ICMPv6Type = 1
 	ICMPv6PacketTooBig   ICMPv6Type = 2
@@ -109,7 +117,18 @@ const (
 	ICMPv6RedirectMsg     ICMPv6Type = 137
 )
 
-// ICMPv6Code is the ICMP code field described in RFC 4443.
+// IsErrorType returns true if the receiver is an ICMP error type.
+func (typ ICMPv6Type) IsErrorType() bool {
+	// Per RFC 4443 section 2.1:
+	//   ICMPv6 messages are grouped into two classes: error messages and
+	//   informational messages.  Error messages are identified as such by a
+	//   zero in the high-order bit of their message Type field values.  Thus,
+	//   error messages have message types from 0 to 127; informational
+	//   messages have message types from 128 to 255.
+	return typ&0x80 == 0
+}
+
+// ICMPv6Code is the ICMP Code field described in RFC 4443.
 type ICMPv6Code byte
 
 // ICMP codes used with Destination Unreachable (Type 1). As per RFC 4443
@@ -132,9 +151,14 @@ const (
 
 // ICMP codes used with Parameter Problem (Type 4). As per RFC 4443 section 3.4.
 const (
+	// ICMPv6ErroneousHeader indicates an erroneous header field was encountered.
 	ICMPv6ErroneousHeader ICMPv6Code = 0
-	ICMPv6UnknownHeader   ICMPv6Code = 1
-	ICMPv6UnknownOption   ICMPv6Code = 2
+
+	// ICMPv6UnknownHeader indicates an unrecognized Next Header type encountered.
+	ICMPv6UnknownHeader ICMPv6Code = 1
+
+	// ICMPv6UnknownOption indicates an unrecognized IPv6 option was encountered.
+	ICMPv6UnknownOption ICMPv6Code = 2
 )
 
 // ICMPv6UnusedCode is the code value used with ICMPv6 messages which don't use
@@ -153,6 +177,16 @@ func (b ICMPv6) Code() ICMPv6Code { return ICMPv6Code(b[1]) }
 // SetCode sets the ICMP code field.
 func (b ICMPv6) SetCode(c ICMPv6Code) { b[1] = byte(c) }
 
+// TypeSpecific returns the type specific data field.
+func (b ICMPv6) TypeSpecific() uint32 {
+	return binary.BigEndian.Uint32(b[icmpv6PointerOffset:])
+}
+
+// SetTypeSpecific sets the type specific data field.
+func (b ICMPv6) SetTypeSpecific(val uint32) {
+	binary.BigEndian.PutUint32(b[icmpv6PointerOffset:], val)
+}
+
 // Checksum is the ICMP checksum field.
 func (b ICMPv6) Checksum() uint16 {
 	return binary.BigEndian.Uint16(b[icmpv6ChecksumOffset:])
diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go
index 680eafd16..ee307d163 100644
--- a/pkg/tcpip/header/ipv4.go
+++ b/pkg/tcpip/header/ipv4.go
@@ -16,10 +16,29 @@ package header
 
 import (
 	"encoding/binary"
+	"fmt"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
+// RFC 971 defines the fields of the IPv4 header on page 11 using the following
+// diagram: ("Figure 4")
+//    0                   1                   2                   3
+//    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//   |Version|  IHL  |Type of Service|          Total Length         |
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//   |         Identification        |Flags|      Fragment Offset    |
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//   |  Time to Live |    Protocol   |         Header Checksum       |
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//   |                       Source Address                          |
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//   |                    Destination Address                        |
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//   |                    Options                    |    Padding    |
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//
 const (
 	versIHL = 0
 	tos     = 1
@@ -33,6 +52,7 @@ const (
 	checksum           = 10
 	srcAddr            = 12
 	dstAddr            = 16
+	options            = 20
 )
 
 // IPv4Fields contains the fields of an IPv4 packet. It is used to describe the
@@ -76,11 +96,13 @@ type IPv4Fields struct {
 // IPv4 represents an ipv4 header stored in a byte array.
 // Most of the methods of IPv4 access to the underlying slice without
 // checking the boundaries and could panic because of 'index out of range'.
-// Always call IsValid() to validate an instance of IPv4 before using other methods.
+// Always call IsValid() to validate an instance of IPv4 before using other
+// methods.
 type IPv4 []byte
 
 const (
-	// IPv4MinimumSize is the minimum size of a valid IPv4 packet.
+	// IPv4MinimumSize is the minimum size of a valid IPv4 packet;
+	// i.e. a packet header with no options.
 	IPv4MinimumSize = 20
 
 	// IPv4MaximumHeaderSize is the maximum size of an IPv4 header. Given
@@ -88,6 +110,16 @@ const (
 	// units, the header cannot exceed 15*4 = 60 bytes.
 	IPv4MaximumHeaderSize = 60
 
+	// IPv4MaximumPayloadSize is the maximum size of a valid IPv4 payload.
+	//
+	// Linux limits this to 65,515 octets (the max IP datagram size - the IPv4
+	// header size). But RFC 791 section 3.2 discusses the design of the IPv4
+	// fragment "allows 2**13 = 8192 fragments of 8 octets each for a total of
+	// 65,536 octets. Note that this is consistent with the the datagram total
+	// length field (of course, the header is counted in the total length and not
+	// in the fragments)."
+	IPv4MaximumPayloadSize = 65536
+
 	// MinIPFragmentPayloadSize is the minimum number of payload bytes that
 	// the first fragment must carry when an IPv4 packet is fragmented.
 	MinIPFragmentPayloadSize = 8
@@ -116,6 +148,13 @@ const (
 	// packet that every IPv4 capable host must be able to
 	// process/reassemble.
 	IPv4MinimumProcessableDatagramSize = 576
+
+	// IPv4MinimumMTU is the minimum MTU required by IPv4, per RFC 791,
+	// section 3.2:
+	//   Every internet module must be able to forward a datagram of 68 octets
+	//   without further fragmentation.  This is because an internet header may be
+	//   up to 60 octets, and the minimum fragment is 8 octets.
+	IPv4MinimumMTU = 68
 )
 
 // Flags that may be set in an IPv4 packet.
@@ -140,13 +179,44 @@ func IPVersion(b []byte) int {
 	if len(b) < versIHL+1 {
 		return -1
 	}
-	return int(b[versIHL] >> 4)
+	return int(b[versIHL] >> ipVersionShift)
 }
 
+// RFC 791 page 11 shows the header length (IHL) is in the lower 4 bits
+// of the first byte, and is counted in multiples of 4 bytes.
+//
+//     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//    |Version|  IHL  |Type of Service|          Total Length         |
+//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//      (...)
+//     Version:  4 bits
+//       The Version field indicates the format of the internet header.  This
+//       document describes version 4.
+//
+//     IHL:  4 bits
+//       Internet Header Length is the length of the internet header in 32
+//       bit words, and thus points to the beginning of the data.  Note that
+//       the minimum value for a correct header is 5.
+//
+const (
+	ipVersionShift = 4
+	ipIHLMask      = 0x0f
+	IPv4IHLStride  = 4
+)
+
 // HeaderLength returns the value of the "header length" field of the ipv4
 // header. The length returned is in bytes.
 func (b IPv4) HeaderLength() uint8 {
-	return (b[versIHL] & 0xf) * 4
+	return (b[versIHL] & ipIHLMask) * IPv4IHLStride
+}
+
+// SetHeaderLength sets the value of the "Internet Header Length" field.
+func (b IPv4) SetHeaderLength(hdrLen uint8) {
+	if hdrLen > IPv4MaximumHeaderSize {
+		panic(fmt.Sprintf("got IPv4 Header size = %d, want <= %d", hdrLen, IPv4MaximumHeaderSize))
+	}
+	b[versIHL] = (IPv4Version << ipVersionShift) | ((hdrLen / IPv4IHLStride) & ipIHLMask)
 }
 
 // ID returns the value of the identifier field of the ipv4 header.
@@ -200,6 +270,12 @@ func (b IPv4) DestinationAddress() tcpip.Address {
 	return tcpip.Address(b[dstAddr : dstAddr+IPv4AddressSize])
 }
 
+// Options returns a a buffer holding the options.
+func (b IPv4) Options() []byte {
+	hdrLen := b.HeaderLength()
+	return b[options:hdrLen:hdrLen]
+}
+
 // TransportProtocol implements Network.TransportProtocol.
 func (b IPv4) TransportProtocol() tcpip.TransportProtocolNumber {
 	return tcpip.TransportProtocolNumber(b.Protocol())
@@ -225,6 +301,11 @@ func (b IPv4) SetTOS(v uint8, _ uint32) {
 	b[tos] = v
 }
 
+// SetTTL sets the "Time to Live" field of the IPv4 header.
+func (b IPv4) SetTTL(v byte) {
+	b[ttl] = v
+}
+
 // SetTotalLength sets the "total length" field of the ipv4 header.
 func (b IPv4) SetTotalLength(totalLength uint16) {
 	binary.BigEndian.PutUint16(b[IPv4TotalLenOffset:], totalLength)
@@ -265,7 +346,7 @@ func (b IPv4) CalculateChecksum() uint16 {
 
 // Encode encodes all the fields of the ipv4 header.
 func (b IPv4) Encode(i *IPv4Fields) {
-	b[versIHL] = (4 << 4) | ((i.IHL / 4) & 0xf)
+	b.SetHeaderLength(i.IHL)
 	b[tos] = i.TOS
 	b.SetTotalLength(i.TotalLength)
 	binary.BigEndian.PutUint16(b[id:], i.ID)
@@ -317,7 +398,7 @@ func IsV4MulticastAddress(addr tcpip.Address) bool {
 }
 
 // IsV4LoopbackAddress determines if the provided address is an IPv4 loopback
-// address (belongs to 127.0.0.1/8 subnet).
+// address (belongs to 127.0.0.0/8 subnet). See RFC 1122 section 3.2.1.3.
 func IsV4LoopbackAddress(addr tcpip.Address) bool {
 	if len(addr) != IPv4AddressSize {
 		return false
diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index ea3823898..09cb153b1 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -34,6 +34,9 @@ const (
 	hopLimit             = 7
 	v6SrcAddr            = 8
 	v6DstAddr            = v6SrcAddr + IPv6AddressSize
+
+	// IPv6FixedHeaderSize is the size of the fixed header.
+	IPv6FixedHeaderSize = v6DstAddr + IPv6AddressSize
 )
 
 // IPv6Fields contains the fields of an IPv6 packet. It is used to describe the
@@ -69,11 +72,15 @@ type IPv6 []byte
 
 const (
 	// IPv6MinimumSize is the minimum size of a valid IPv6 packet.
-	IPv6MinimumSize = 40
+	IPv6MinimumSize = IPv6FixedHeaderSize
 
 	// IPv6AddressSize is the size, in bytes, of an IPv6 address.
 	IPv6AddressSize = 16
 
+	// IPv6MaximumPayloadSize is the maximum size of a valid IPv6 payload per
+	// RFC 8200 Section 4.5.
+	IPv6MaximumPayloadSize = 65535
+
 	// IPv6ProtocolNumber is IPv6's network protocol number.
 	IPv6ProtocolNumber tcpip.NetworkProtocolNumber = 0x86dd
 
@@ -94,8 +101,10 @@ const (
 	// The address is ff02::2.
 	IPv6AllRoutersMulticastAddress tcpip.Address = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
 
-	// IPv6MinimumMTU is the minimum MTU required by IPv6, per RFC 2460,
-	// section 5.
+	// IPv6MinimumMTU is the minimum MTU required by IPv6, per RFC 8200,
+	// section 5:
+	//   IPv6 requires that every link in the Internet have an MTU of 1280 octets
+	//   or greater.  This is known as the IPv6 minimum link MTU.
 	IPv6MinimumMTU = 1280
 
 	// IPv6Loopback is the IPv6 Loopback address.
@@ -302,14 +311,21 @@ func IsV6UnicastAddress(addr tcpip.Address) bool {
 	return addr[0] != 0xff
 }
 
+const solicitedNodeMulticastPrefix = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff"
+
 // SolicitedNodeAddr computes the solicited-node multicast address. This is
 // used for NDP. Described in RFC 4291. The argument must be a full-length IPv6
 // address.
 func SolicitedNodeAddr(addr tcpip.Address) tcpip.Address {
-	const solicitedNodeMulticastPrefix = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff"
 	return solicitedNodeMulticastPrefix + addr[len(addr)-3:]
 }
 
+// IsSolicitedNodeAddr determines whether the address is a solicited-node
+// multicast address.
+func IsSolicitedNodeAddr(addr tcpip.Address) bool {
+	return solicitedNodeMulticastPrefix == addr[:len(addr)-3]
+}
+
 // EthernetAdddressToModifiedEUI64IntoBuf populates buf with a modified EUI-64
 // from a 48-bit Ethernet/MAC address, as per RFC 4291 section 2.5.1.
 //
diff --git a/pkg/tcpip/header/ipv6_extension_headers.go b/pkg/tcpip/header/ipv6_extension_headers.go
index 3499d8399..583c2c5d3 100644
--- a/pkg/tcpip/header/ipv6_extension_headers.go
+++ b/pkg/tcpip/header/ipv6_extension_headers.go
@@ -149,6 +149,19 @@ func (b ipv6OptionsExtHdr) Iter() IPv6OptionsExtHdrOptionsIterator {
 // obtained before modification is no longer used.
 type IPv6OptionsExtHdrOptionsIterator struct {
 	reader bytes.Reader
+
+	// optionOffset is the number of bytes from the first byte of the
+	// options field to the beginning of the current option.
+	optionOffset uint32
+
+	// nextOptionOffset is the offset of the next option.
+	nextOptionOffset uint32
+}
+
+// OptionOffset returns the number of bytes parsed while processing the
+// option field of the current Extension Header.
+func (i *IPv6OptionsExtHdrOptionsIterator) OptionOffset() uint32 {
+	return i.optionOffset
 }
 
 // IPv6OptionUnknownAction is the action that must be taken if the processing
@@ -226,6 +239,7 @@ func (*IPv6UnknownExtHdrOption) isIPv6ExtHdrOption() {}
 // the options data, or an error occured.
 func (i *IPv6OptionsExtHdrOptionsIterator) Next() (IPv6ExtHdrOption, bool, error) {
 	for {
+		i.optionOffset = i.nextOptionOffset
 		temp, err := i.reader.ReadByte()
 		if err != nil {
 			// If we can't read the first byte of a new option, then we know the
@@ -238,6 +252,7 @@ func (i *IPv6OptionsExtHdrOptionsIterator) Next() (IPv6ExtHdrOption, bool, error
 		// know the option does not have Length and Data fields. End processing of
 		// the Pad1 option and continue processing the buffer as a new option.
 		if id == ipv6Pad1ExtHdrOptionIdentifier {
+			i.nextOptionOffset = i.optionOffset + 1
 			continue
 		}
 
@@ -254,41 +269,40 @@ func (i *IPv6OptionsExtHdrOptionsIterator) Next() (IPv6ExtHdrOption, bool, error
 			return nil, true, fmt.Errorf("error when reading the option's Length field for option with id = %d: %w", id, io.ErrUnexpectedEOF)
 		}
 
-		// Special-case the variable length padding option to avoid a copy.
-		if id == ipv6PadNExtHdrOptionIdentifier {
-			// Do we have enough bytes in the reader for the PadN option?
-			if n := i.reader.Len(); n < int(length) {
-				// Reset the reader to effectively consume the remaining buffer.
-				i.reader.Reset(nil)
-
-				// We return the same error as if we failed to read a non-padding option
-				// so consumers of this iterator don't need to differentiate between
-				// padding and non-padding options.
-				return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, io.ErrUnexpectedEOF)
-			}
+		// Do we have enough bytes in the reader for the next option?
+		if n := i.reader.Len(); n < int(length) {
+			// Reset the reader to effectively consume the remaining buffer.
+			i.reader.Reset(nil)
+
+			// We return the same error as if we failed to read a non-padding option
+			// so consumers of this iterator don't need to differentiate between
+			// padding and non-padding options.
+			return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, io.ErrUnexpectedEOF)
+		}
+
+		i.nextOptionOffset = i.optionOffset + uint32(length) + 1 /* option ID */ + 1 /* length byte */
 
+		switch id {
+		case ipv6PadNExtHdrOptionIdentifier:
+			// Special-case the variable length padding option to avoid a copy.
 			if _, err := i.reader.Seek(int64(length), io.SeekCurrent); err != nil {
 				panic(fmt.Sprintf("error when skipping PadN (N = %d) option's data bytes: %s", length, err))
 			}
-
-			// End processing of the PadN option and continue processing the buffer as
-			// a new option.
 			continue
-		}
-
-		bytes := make([]byte, length)
-		if n, err := io.ReadFull(&i.reader, bytes); err != nil {
-			// io.ReadFull may return io.EOF if i.reader has been exhausted. We use
-			// io.ErrUnexpectedEOF instead as the io.EOF is unexpected given the
-			// Length field found in the option.
-			if err == io.EOF {
-				err = io.ErrUnexpectedEOF
+		default:
+			bytes := make([]byte, length)
+			if n, err := io.ReadFull(&i.reader, bytes); err != nil {
+				// io.ReadFull may return io.EOF if i.reader has been exhausted. We use
+				// io.ErrUnexpectedEOF instead as the io.EOF is unexpected given the
+				// Length field found in the option.
+				if err == io.EOF {
+					err = io.ErrUnexpectedEOF
+				}
+
+				return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, err)
 			}
-
-			return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, err)
+			return &IPv6UnknownExtHdrOption{Identifier: id, Data: bytes}, false, nil
 		}
-
-		return &IPv6UnknownExtHdrOption{Identifier: id, Data: bytes}, false, nil
 	}
 }
 
@@ -382,6 +396,29 @@ type IPv6PayloadIterator struct {
 	// Indicates to the iterator that it should return the remaining payload as a
 	// raw payload on the next call to Next.
 	forceRaw bool
+
+	// headerOffset is the offset of the beginning of the current extension
+	// header starting from the beginning of the fixed header.
+	headerOffset uint32
+
+	// parseOffset is the byte offset into the current extension header of the
+	// field we are currently examining. It can be added to the header offset
+	// if the absolute offset within the packet is required.
+	parseOffset uint32
+
+	// nextOffset is the offset of the next header.
+	nextOffset uint32
+}
+
+// HeaderOffset returns the offset to the start of the extension
+// header most recently processed.
+func (i IPv6PayloadIterator) HeaderOffset() uint32 {
+	return i.headerOffset
+}
+
+// ParseOffset returns the number of bytes successfully parsed.
+func (i IPv6PayloadIterator) ParseOffset() uint32 {
+	return i.headerOffset + i.parseOffset
 }
 
 // MakeIPv6PayloadIterator returns an iterator over the IPv6 payload containing
@@ -397,7 +434,8 @@ func MakeIPv6PayloadIterator(nextHdrIdentifier IPv6ExtensionHeaderIdentifier, pa
 		nextHdrIdentifier: nextHdrIdentifier,
 		payload:           payload.Clone(nil),
 		// We need a buffer of size 1 for calls to bufio.Reader.ReadByte.
-		reader: *bufio.NewReaderSize(io.MultiReader(readerPs...), 1),
+		reader:     *bufio.NewReaderSize(io.MultiReader(readerPs...), 1),
+		nextOffset: IPv6FixedHeaderSize,
 	}
 }
 
@@ -434,6 +472,8 @@ func (i *IPv6PayloadIterator) AsRawHeader(consume bool) IPv6RawPayloadHeader {
 // Next is unable to return anything because the iterator has reached the end of
 // the payload, or an error occured.
 func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) {
+	i.headerOffset = i.nextOffset
+	i.parseOffset = 0
 	// We could be forced to return i as a raw header when the previous header was
 	// a fragment extension header as the data following the fragment extension
 	// header may not be complete.
@@ -461,7 +501,7 @@ func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) {
 		return IPv6RoutingExtHdr(bytes), false, nil
 	case IPv6FragmentExtHdrIdentifier:
 		var data [6]byte
-		// We ignore the returned bytes becauase we know the fragment extension
+		// We ignore the returned bytes because we know the fragment extension
 		// header specific data will fit in data.
 		nextHdrIdentifier, _, err := i.nextHeaderData(true /* fragmentHdr */, data[:])
 		if err != nil {
@@ -519,10 +559,12 @@ func (i *IPv6PayloadIterator) nextHeaderData(fragmentHdr bool, bytes []byte) (IP
 	if err != nil {
 		return 0, nil, fmt.Errorf("error when reading the Next Header field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
 	}
+	i.parseOffset++
 
 	var length uint8
 	length, err = i.reader.ReadByte()
 	i.payload.TrimFront(1)
+
 	if err != nil {
 		if fragmentHdr {
 			return 0, nil, fmt.Errorf("error when reading the Length field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
@@ -534,6 +576,17 @@ func (i *IPv6PayloadIterator) nextHeaderData(fragmentHdr bool, bytes []byte) (IP
 		length = 0
 	}
 
+	// Make parseOffset point to the first byte of the Extension Header
+	// specific data.
+	i.parseOffset++
+
+	// length is in 8 byte chunks but doesn't include the first one.
+	// See RFC 8200 for each header type, sections 4.3-4.6 and the requirement
+	// in section 4.8 for new extension headers at the top of page 24.
+	//   [ Hdr Ext Len ] ... Length of the Destination Options header in 8-octet
+	//   units, not including the first 8 octets.
+	i.nextOffset += uint32((length + 1) * ipv6ExtHdrLenBytesPerUnit)
+
 	bytesLen := int(length)*ipv6ExtHdrLenBytesPerUnit + ipv6ExtHdrLenBytesExcluded
 	if bytes == nil {
 		bytes = make([]byte, bytesLen)
diff --git a/pkg/tcpip/header/ipversion_test.go b/pkg/tcpip/header/ipversion_test.go
index b5540bf66..17a49d4fa 100644
--- a/pkg/tcpip/header/ipversion_test.go
+++ b/pkg/tcpip/header/ipversion_test.go
@@ -22,7 +22,7 @@ import (
 
 func TestIPv4(t *testing.T) {
 	b := header.IPv4(make([]byte, header.IPv4MinimumSize))
-	b.Encode(&header.IPv4Fields{})
+	b.Encode(&header.IPv4Fields{IHL: header.IPv4MinimumSize})
 
 	const want = header.IPv4Version
 	if v := header.IPVersion(b); v != want {
diff --git a/pkg/tcpip/header/parse/BUILD b/pkg/tcpip/header/parse/BUILD
new file mode 100644
index 000000000..2adee9288
--- /dev/null
+++ b/pkg/tcpip/header/parse/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "parse",
+    srcs = ["parse.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/header/parse/parse.go b/pkg/tcpip/header/parse/parse.go
new file mode 100644
index 000000000..5ca75c834
--- /dev/null
+++ b/pkg/tcpip/header/parse/parse.go
@@ -0,0 +1,168 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package parse provides utilities to parse packets.
+package parse
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// ARP populates pkt's network header with an ARP header found in
+// pkt.Data.
+//
+// Returns true if the header was successfully parsed.
+func ARP(pkt *stack.PacketBuffer) bool {
+	_, ok := pkt.NetworkHeader().Consume(header.ARPSize)
+	if ok {
+		pkt.NetworkProtocolNumber = header.ARPProtocolNumber
+	}
+	return ok
+}
+
+// IPv4 parses an IPv4 packet found in pkt.Data and populates pkt's network
+// header with the IPv4 header.
+//
+// Returns true if the header was successfully parsed.
+func IPv4(pkt *stack.PacketBuffer) bool {
+	hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return false
+	}
+	ipHdr := header.IPv4(hdr)
+
+	// Header may have options, determine the true header length.
+	headerLen := int(ipHdr.HeaderLength())
+	if headerLen < header.IPv4MinimumSize {
+		// TODO(gvisor.dev/issue/2404): Per RFC 791, IHL needs to be at least 5 in
+		// order for the packet to be valid. Figure out if we want to reject this
+		// case.
+		headerLen = header.IPv4MinimumSize
+	}
+	hdr, ok = pkt.NetworkHeader().Consume(headerLen)
+	if !ok {
+		return false
+	}
+	ipHdr = header.IPv4(hdr)
+
+	pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber
+	pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr))
+	return true
+}
+
+// IPv6 parses an IPv6 packet found in pkt.Data and populates pkt's network
+// header with the IPv6 header.
+func IPv6(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, fragID uint32, fragOffset uint16, fragMore bool, ok bool) {
+	hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	if !ok {
+		return 0, 0, 0, false, false
+	}
+	ipHdr := header.IPv6(hdr)
+
+	// dataClone consists of:
+	// - Any IPv6 header bytes after the first 40 (i.e. extensions).
+	// - The transport header, if present.
+	// - Any other payload data.
+	views := [8]buffer.View{}
+	dataClone := pkt.Data.Clone(views[:])
+	dataClone.TrimFront(header.IPv6MinimumSize)
+	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone)
+
+	// Iterate over the IPv6 extensions to find their length.
+	var nextHdr tcpip.TransportProtocolNumber
+	var extensionsSize int
+
+traverseExtensions:
+	for {
+		extHdr, done, err := it.Next()
+		if err != nil {
+			break
+		}
+
+		// If we exhaust the extension list, the entire packet is the IPv6 header
+		// and (possibly) extensions.
+		if done {
+			extensionsSize = dataClone.Size()
+			break
+		}
+
+		switch extHdr := extHdr.(type) {
+		case header.IPv6FragmentExtHdr:
+			if fragID == 0 && fragOffset == 0 && !fragMore {
+				fragID = extHdr.ID()
+				fragOffset = extHdr.FragmentOffset()
+				fragMore = extHdr.More()
+			}
+
+		case header.IPv6RawPayloadHeader:
+			// We've found the payload after any extensions.
+			extensionsSize = dataClone.Size() - extHdr.Buf.Size()
+			nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier)
+			break traverseExtensions
+
+		default:
+			// Any other extension is a no-op, keep looping until we find the payload.
+		}
+	}
+
+	// Put the IPv6 header with extensions in pkt.NetworkHeader().
+	hdr, ok = pkt.NetworkHeader().Consume(header.IPv6MinimumSize + extensionsSize)
+	if !ok {
+		panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size()))
+	}
+	ipHdr = header.IPv6(hdr)
+	pkt.Data.CapLength(int(ipHdr.PayloadLength()))
+	pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber
+
+	return nextHdr, fragID, fragOffset, fragMore, true
+}
+
+// UDP parses a UDP packet found in pkt.Data and populates pkt's transport
+// header with the UDP header.
+//
+// Returns true if the header was successfully parsed.
+func UDP(pkt *stack.PacketBuffer) bool {
+	_, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize)
+	pkt.TransportProtocolNumber = header.UDPProtocolNumber
+	return ok
+}
+
+// TCP parses a TCP packet found in pkt.Data and populates pkt's transport
+// header with the TCP header.
+//
+// Returns true if the header was successfully parsed.
+func TCP(pkt *stack.PacketBuffer) bool {
+	// TCP header is variable length, peek at it first.
+	hdrLen := header.TCPMinimumSize
+	hdr, ok := pkt.Data.PullUp(hdrLen)
+	if !ok {
+		return false
+	}
+
+	// If the header has options, pull those up as well.
+	if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() {
+		// TODO(gvisor.dev/issue/2404): Figure out whether to reject this kind of
+		// packets.
+		hdrLen = offset
+	}
+
+	_, ok = pkt.TransportHeader().Consume(hdrLen)
+	pkt.TransportProtocolNumber = header.TCPProtocolNumber
+	return ok
+}
diff --git a/pkg/tcpip/header/udp.go b/pkg/tcpip/header/udp.go
index 9339d637f..98bdd29db 100644
--- a/pkg/tcpip/header/udp.go
+++ b/pkg/tcpip/header/udp.go
@@ -16,6 +16,7 @@ package header
 
 import (
 	"encoding/binary"
+	"math"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
@@ -55,6 +56,10 @@ const (
 	// UDPMinimumSize is the minimum size of a valid UDP packet.
 	UDPMinimumSize = 8
 
+	// UDPMaximumSize is the maximum size of a valid UDP packet. The length field
+	// in the UDP header is 16 bits as per RFC 768.
+	UDPMaximumSize = math.MaxUint16
+
 	// UDPProtocolNumber is UDP's transport protocol number.
 	UDPProtocolNumber tcpip.TransportProtocolNumber = 17
 )
diff --git a/pkg/tcpip/link/ethernet/BUILD b/pkg/tcpip/link/ethernet/BUILD
new file mode 100644
index 000000000..ec92ed623
--- /dev/null
+++ b/pkg/tcpip/link/ethernet/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "ethernet",
+    srcs = ["ethernet.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/nested",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/link/ethernet/ethernet.go b/pkg/tcpip/link/ethernet/ethernet.go
new file mode 100644
index 000000000..3eef7cd56
--- /dev/null
+++ b/pkg/tcpip/link/ethernet/ethernet.go
@@ -0,0 +1,99 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ethernet provides an implementation of an ethernet link endpoint that
+// wraps an inner link endpoint.
+package ethernet
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/nested"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+var _ stack.NetworkDispatcher = (*Endpoint)(nil)
+var _ stack.LinkEndpoint = (*Endpoint)(nil)
+
+// New returns an ethernet link endpoint that wraps an inner link endpoint.
+func New(ep stack.LinkEndpoint) *Endpoint {
+	var e Endpoint
+	e.Endpoint.Init(ep, &e)
+	return &e
+}
+
+// Endpoint is an ethernet endpoint.
+//
+// It adds an ethernet header to packets before sending them out through its
+// inner link endpoint and consumes an ethernet header before sending the
+// packet to the stack.
+type Endpoint struct {
+	nested.Endpoint
+}
+
+// DeliverNetworkPacket implements stack.NetworkDispatcher.
+func (e *Endpoint) DeliverNetworkPacket(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	hdr, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize)
+	if !ok {
+		return
+	}
+
+	eth := header.Ethernet(hdr)
+	if dst := eth.DestinationAddress(); dst == e.Endpoint.LinkAddress() || dst == header.EthernetBroadcastAddress || header.IsMulticastEthernetAddress(dst) {
+		e.Endpoint.DeliverNetworkPacket(eth.SourceAddress() /* remote */, dst /* local */, eth.Type() /* protocol */, pkt)
+	}
+}
+
+// Capabilities implements stack.LinkEndpoint.
+func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return stack.CapabilityResolutionRequired | e.Endpoint.Capabilities()
+}
+
+// WritePacket implements stack.LinkEndpoint.
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	e.AddHeader(e.Endpoint.LinkAddress(), r.RemoteLinkAddress, proto, pkt)
+	return e.Endpoint.WritePacket(r, gso, proto, pkt)
+}
+
+// WritePackets implements stack.LinkEndpoint.
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, proto tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	linkAddr := e.Endpoint.LinkAddress()
+
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		e.AddHeader(linkAddr, r.RemoteLinkAddress, proto, pkt)
+	}
+
+	return e.Endpoint.WritePackets(r, gso, pkts, proto)
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.
+func (e *Endpoint) MaxHeaderLength() uint16 {
+	return header.EthernetMinimumSize + e.Endpoint.MaxHeaderLength()
+}
+
+// ARPHardwareType implements stack.LinkEndpoint.
+func (*Endpoint) ARPHardwareType() header.ARPHardwareType {
+	return header.ARPHardwareEther
+}
+
+// AddHeader implements stack.LinkEndpoint.
+func (*Endpoint) AddHeader(local, remote tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize))
+	fields := header.EthernetFields{
+		SrcAddr: local,
+		DstAddr: remote,
+		Type:    proto,
+	}
+	eth.Encode(&fields)
+}
diff --git a/pkg/tcpip/link/pipe/BUILD b/pkg/tcpip/link/pipe/BUILD
new file mode 100644
index 000000000..9f31c1ffc
--- /dev/null
+++ b/pkg/tcpip/link/pipe/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "pipe",
+    srcs = ["pipe.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/link/pipe/pipe.go b/pkg/tcpip/link/pipe/pipe.go
new file mode 100644
index 000000000..523b0d24b
--- /dev/null
+++ b/pkg/tcpip/link/pipe/pipe.go
@@ -0,0 +1,115 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pipe provides the implementation of pipe-like data-link layer
+// endpoints. Such endpoints allow packets to be sent between two interfaces.
+package pipe
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+var _ stack.LinkEndpoint = (*Endpoint)(nil)
+
+// New returns both ends of a new pipe.
+func New(linkAddr1, linkAddr2 tcpip.LinkAddress) (*Endpoint, *Endpoint) {
+	ep1 := &Endpoint{
+		linkAddr: linkAddr1,
+	}
+	ep2 := &Endpoint{
+		linkAddr: linkAddr2,
+	}
+	ep1.linked = ep2
+	ep2.linked = ep1
+	return ep1, ep2
+}
+
+// Endpoint is one end of a pipe.
+type Endpoint struct {
+	dispatcher stack.NetworkDispatcher
+	linked     *Endpoint
+	linkAddr   tcpip.LinkAddress
+}
+
+// WritePacket implements stack.LinkEndpoint.
+func (e *Endpoint) WritePacket(r *stack.Route, _ *stack.GSO, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	if !e.linked.IsAttached() {
+		return nil
+	}
+
+	// Note that the local address from the perspective of this endpoint is the
+	// remote address from the perspective of the other end of the pipe
+	// (e.linked). Similarly, the remote address from the perspective of this
+	// endpoint is the local address on the other end.
+	e.linked.dispatcher.DeliverNetworkPacket(r.LocalLinkAddress /* remote */, r.RemoteLinkAddress /* local */, proto, stack.NewPacketBuffer(stack.PacketBufferOptions{
+		Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()),
+	}))
+
+	return nil
+}
+
+// WritePackets implements stack.LinkEndpoint.
+func (*Endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	panic("not implemented")
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.
+func (*Endpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
+	panic("not implemented")
+}
+
+// Attach implements stack.LinkEndpoint.
+func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	e.dispatcher = dispatcher
+}
+
+// IsAttached implements stack.LinkEndpoint.
+func (e *Endpoint) IsAttached() bool {
+	return e.dispatcher != nil
+}
+
+// Wait implements stack.LinkEndpoint.
+func (*Endpoint) Wait() {}
+
+// MTU implements stack.LinkEndpoint.
+func (*Endpoint) MTU() uint32 {
+	return header.IPv6MinimumMTU
+}
+
+// Capabilities implements stack.LinkEndpoint.
+func (*Endpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return 0
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.
+func (*Endpoint) MaxHeaderLength() uint16 {
+	return 0
+}
+
+// LinkAddress implements stack.LinkEndpoint.
+func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
+	return e.linkAddr
+}
+
+// ARPHardwareType implements stack.LinkEndpoint.
+func (*Endpoint) ARPHardwareType() header.ARPHardwareType {
+	return header.ARPHardwareNone
+}
+
+// AddHeader implements stack.LinkEndpoint.
+func (*Endpoint) AddHeader(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, _ *stack.PacketBuffer) {
+}
diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD
index 14b527bc2..6c410c5a6 100644
--- a/pkg/tcpip/link/rawfile/BUILD
+++ b/pkg/tcpip/link/rawfile/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -18,3 +18,14 @@ go_library(
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
+
+go_test(
+    name = "rawfile_test",
+    srcs = [
+        "errors_test.go",
+    ],
+    library = "rawfile",
+    deps = [
+        "//pkg/tcpip",
+    ],
+)
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
index 99313ee25..5db4bf12b 100644
--- a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
+++ b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
@@ -14,7 +14,7 @@
 
 // +build linux,amd64 linux,arm64
 // +build go1.12
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/tcpip/link/rawfile/errors.go b/pkg/tcpip/link/rawfile/errors.go
index a0a873c84..604868fd8 100644
--- a/pkg/tcpip/link/rawfile/errors.go
+++ b/pkg/tcpip/link/rawfile/errors.go
@@ -31,10 +31,12 @@ var translations [maxErrno]*tcpip.Error
 // *tcpip.Error.
 //
 // Valid, but unrecognized errnos will be translated to
-// tcpip.ErrInvalidEndpointState (EINVAL). Panics on invalid errnos.
+// tcpip.ErrInvalidEndpointState (EINVAL).
 func TranslateErrno(e syscall.Errno) *tcpip.Error {
-	if err := translations[e]; err != nil {
-		return err
+	if e > 0 && e < syscall.Errno(len(translations)) {
+		if err := translations[e]; err != nil {
+			return err
+		}
 	}
 	return tcpip.ErrInvalidEndpointState
 }
diff --git a/pkg/tcpip/link/rawfile/errors_test.go b/pkg/tcpip/link/rawfile/errors_test.go
new file mode 100644
index 000000000..e4cdc66bd
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/errors_test.go
@@ -0,0 +1,53 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package rawfile
+
+import (
+	"syscall"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+func TestTranslateErrno(t *testing.T) {
+	for _, test := range []struct {
+		errno      syscall.Errno
+		translated *tcpip.Error
+	}{
+		{
+			errno:      syscall.Errno(0),
+			translated: tcpip.ErrInvalidEndpointState,
+		},
+		{
+			errno:      syscall.Errno(maxErrno),
+			translated: tcpip.ErrInvalidEndpointState,
+		},
+		{
+			errno:      syscall.Errno(514),
+			translated: tcpip.ErrInvalidEndpointState,
+		},
+		{
+			errno:      syscall.EEXIST,
+			translated: tcpip.ErrDuplicateAddress,
+		},
+	} {
+		got := TranslateErrno(test.errno)
+		if got != test.translated {
+			t.Errorf("TranslateErrno(%q) = %q, want %q", test.errno, got, test.translated)
+		}
+	}
+}
diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go
index dc239a0d0..2777f1411 100644
--- a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go
+++ b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go
@@ -470,6 +470,7 @@ func TestConcurrentReaderWriter(t *testing.T) {
 
 	const count = 1000000
 	var wg sync.WaitGroup
+	defer wg.Wait()
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
@@ -489,30 +490,23 @@ func TestConcurrentReaderWriter(t *testing.T) {
 		}
 	}()
 
-	wg.Add(1)
-	go func() {
-		defer wg.Done()
-		runtime.Gosched()
-		for i := 0; i < count; i++ {
-			n := 1 + rr.Intn(80)
-			rb := rx.Pull()
-			for rb == nil {
-				rb = rx.Pull()
-			}
+	for i := 0; i < count; i++ {
+		n := 1 + rr.Intn(80)
+		rb := rx.Pull()
+		for rb == nil {
+			rb = rx.Pull()
+		}
 
-			if n != len(rb) {
-				t.Fatalf("Bad %v-th buffer length: got %v, want %v", i, len(rb), n)
-			}
+		if n != len(rb) {
+			t.Fatalf("Bad %v-th buffer length: got %v, want %v", i, len(rb), n)
+		}
 
-			for j := range rb {
-				if v := byte(rr.Intn(256)); v != rb[j] {
-					t.Fatalf("Bad %v-th read buffer at index %v: got %v, want %v", i, j, rb[j], v)
-				}
+		for j := range rb {
+			if v := byte(rr.Intn(256)); v != rb[j] {
+				t.Fatalf("Bad %v-th read buffer at index %v: got %v, want %v", i, j, rb[j], v)
 			}
-
-			rx.Flush()
 		}
-	}()
 
-	wg.Wait()
+		rx.Flush()
+	}
 }
diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD
index 7cbc305e7..4aac12a8c 100644
--- a/pkg/tcpip/link/sniffer/BUILD
+++ b/pkg/tcpip/link/sniffer/BUILD
@@ -14,6 +14,7 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/header/parse",
         "//pkg/tcpip/link/nested",
         "//pkg/tcpip/stack",
     ],
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 4fb127978..560477926 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -31,6 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/header/parse"
 	"gvisor.dev/gvisor/pkg/tcpip/link/nested"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
@@ -195,49 +196,52 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 	var transProto uint8
 	src := tcpip.Address("unknown")
 	dst := tcpip.Address("unknown")
-	id := 0
-	size := uint16(0)
+	var size uint16
+	var id uint32
 	var fragmentOffset uint16
 	var moreFragments bool
 
-	// Examine the packet using a new VV. Backing storage must not be written.
-	vv := buffer.NewVectorisedView(pkt.Size(), pkt.Views())
-
+	// Clone the packet buffer to not modify the original.
+	//
+	// We don't clone the original packet buffer so that the new packet buffer
+	// does not have any of its headers set.
+	pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views())})
 	switch protocol {
 	case header.IPv4ProtocolNumber:
-		hdr, ok := vv.PullUp(header.IPv4MinimumSize)
-		if !ok {
+		if ok := parse.IPv4(pkt); !ok {
 			return
 		}
-		ipv4 := header.IPv4(hdr)
+
+		ipv4 := header.IPv4(pkt.NetworkHeader().View())
 		fragmentOffset = ipv4.FragmentOffset()
 		moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments
 		src = ipv4.SourceAddress()
 		dst = ipv4.DestinationAddress()
 		transProto = ipv4.Protocol()
 		size = ipv4.TotalLength() - uint16(ipv4.HeaderLength())
-		vv.TrimFront(int(ipv4.HeaderLength()))
-		id = int(ipv4.ID())
+		id = uint32(ipv4.ID())
 
 	case header.IPv6ProtocolNumber:
-		hdr, ok := vv.PullUp(header.IPv6MinimumSize)
+		proto, fragID, fragOffset, fragMore, ok := parse.IPv6(pkt)
 		if !ok {
 			return
 		}
-		ipv6 := header.IPv6(hdr)
+
+		ipv6 := header.IPv6(pkt.NetworkHeader().View())
 		src = ipv6.SourceAddress()
 		dst = ipv6.DestinationAddress()
-		transProto = ipv6.NextHeader()
+		transProto = uint8(proto)
 		size = ipv6.PayloadLength()
-		vv.TrimFront(header.IPv6MinimumSize)
+		id = fragID
+		moreFragments = fragMore
+		fragmentOffset = fragOffset
 
 	case header.ARPProtocolNumber:
-		hdr, ok := vv.PullUp(header.ARPSize)
-		if !ok {
+		if parse.ARP(pkt) {
 			return
 		}
-		vv.TrimFront(header.ARPSize)
-		arp := header.ARP(hdr)
+
+		arp := header.ARP(pkt.NetworkHeader().View())
 		log.Infof(
 			"%s arp %s (%s) -> %s (%s) valid:%t",
 			prefix,
@@ -259,7 +263,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 	switch tcpip.TransportProtocolNumber(transProto) {
 	case header.ICMPv4ProtocolNumber:
 		transName = "icmp"
-		hdr, ok := vv.PullUp(header.ICMPv4MinimumSize)
+		hdr, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize)
 		if !ok {
 			break
 		}
@@ -296,7 +300,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 
 	case header.ICMPv6ProtocolNumber:
 		transName = "icmp"
-		hdr, ok := vv.PullUp(header.ICMPv6MinimumSize)
+		hdr, ok := pkt.Data.PullUp(header.ICMPv6MinimumSize)
 		if !ok {
 			break
 		}
@@ -331,11 +335,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 
 	case header.UDPProtocolNumber:
 		transName = "udp"
-		hdr, ok := vv.PullUp(header.UDPMinimumSize)
-		if !ok {
+		if ok := parse.UDP(pkt); !ok {
 			break
 		}
-		udp := header.UDP(hdr)
+
+		udp := header.UDP(pkt.TransportHeader().View())
 		if fragmentOffset == 0 {
 			srcPort = udp.SourcePort()
 			dstPort = udp.DestinationPort()
@@ -345,19 +349,19 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 
 	case header.TCPProtocolNumber:
 		transName = "tcp"
-		hdr, ok := vv.PullUp(header.TCPMinimumSize)
-		if !ok {
+		if ok := parse.TCP(pkt); !ok {
 			break
 		}
-		tcp := header.TCP(hdr)
+
+		tcp := header.TCP(pkt.TransportHeader().View())
 		if fragmentOffset == 0 {
 			offset := int(tcp.DataOffset())
 			if offset < header.TCPMinimumSize {
 				details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset)
 				break
 			}
-			if offset > vv.Size() && !moreFragments {
-				details += fmt.Sprintf("invalid packet: tcp data offset %d larger than packet buffer length %d", offset, vv.Size())
+			if size := pkt.Data.Size() + len(tcp); offset > size && !moreFragments {
+				details += fmt.Sprintf("invalid packet: tcp data offset %d larger than tcp packet length %d", offset, size)
 				break
 			}
 
diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD
index 6c137f693..86f14db76 100644
--- a/pkg/tcpip/link/tun/BUILD
+++ b/pkg/tcpip/link/tun/BUILD
@@ -1,19 +1,34 @@
 load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
+go_template_instance(
+    name = "tun_endpoint_refs",
+    out = "tun_endpoint_refs.go",
+    package = "tun",
+    prefix = "tunEndpoint",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "tunEndpoint",
+    },
+)
+
 go_library(
     name = "tun",
     srcs = [
         "device.go",
         "protocol.go",
+        "tun_endpoint_refs.go",
         "tun_unsafe.go",
     ],
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/log",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/tcpip",
diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
index 3b1510a33..cda6328a2 100644
--- a/pkg/tcpip/link/tun/device.go
+++ b/pkg/tcpip/link/tun/device.go
@@ -19,7 +19,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -77,13 +76,29 @@ func (d *Device) Release(ctx context.Context) {
 	}
 }
 
+// NICID returns the NIC ID of the device.
+//
+// Must only be called after the device has been attached to an endpoint.
+func (d *Device) NICID() tcpip.NICID {
+	d.mu.RLock()
+	defer d.mu.RUnlock()
+
+	if d.endpoint == nil {
+		panic("called NICID on a device that has not been attached")
+	}
+
+	return d.endpoint.nicID
+}
+
 // SetIff services TUNSETIFF ioctl(2) request.
-func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
+//
+// Returns true if a new NIC was created; false if an existing one was attached.
+func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) (bool, error) {
 	d.mu.Lock()
 	defer d.mu.Unlock()
 
 	if d.endpoint != nil {
-		return syserror.EINVAL
+		return false, syserror.EINVAL
 	}
 
 	// Input validations.
@@ -91,7 +106,7 @@ func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
 	isTap := flags&linux.IFF_TAP != 0
 	supportedFlags := uint16(linux.IFF_TUN | linux.IFF_TAP | linux.IFF_NO_PI)
 	if isTap && isTun || !isTap && !isTun || flags&^supportedFlags != 0 {
-		return syserror.EINVAL
+		return false, syserror.EINVAL
 	}
 
 	prefix := "tun"
@@ -104,32 +119,32 @@ func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
 		linkCaps |= stack.CapabilityResolutionRequired
 	}
 
-	endpoint, err := attachOrCreateNIC(s, name, prefix, linkCaps)
+	endpoint, created, err := attachOrCreateNIC(s, name, prefix, linkCaps)
 	if err != nil {
-		return syserror.EINVAL
+		return false, syserror.EINVAL
 	}
 
 	d.endpoint = endpoint
 	d.notifyHandle = d.endpoint.AddNotify(d)
 	d.flags = flags
-	return nil
+	return created, nil
 }
 
-func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkEndpointCapabilities) (*tunEndpoint, error) {
+func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkEndpointCapabilities) (*tunEndpoint, bool, error) {
 	for {
 		// 1. Try to attach to an existing NIC.
 		if name != "" {
-			if nic, found := s.GetNICByName(name); found {
-				endpoint, ok := nic.LinkEndpoint().(*tunEndpoint)
+			if linkEP := s.GetLinkEndpointByName(name); linkEP != nil {
+				endpoint, ok := linkEP.(*tunEndpoint)
 				if !ok {
 					// Not a NIC created by tun device.
-					return nil, syserror.EOPNOTSUPP
+					return nil, false, syserror.EOPNOTSUPP
 				}
 				if !endpoint.TryIncRef() {
 					// Race detected: NIC got deleted in between.
 					continue
 				}
-				return endpoint, nil
+				return endpoint, false, nil
 			}
 		}
 
@@ -142,6 +157,7 @@ func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkE
 			name:     name,
 			isTap:    prefix == "tap",
 		}
+		endpoint.EnableLeakCheck()
 		endpoint.Endpoint.LinkEPCapabilities = linkCaps
 		if endpoint.name == "" {
 			endpoint.name = fmt.Sprintf("%s%d", prefix, id)
@@ -151,12 +167,12 @@ func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkE
 		})
 		switch err {
 		case nil:
-			return endpoint, nil
+			return endpoint, true, nil
 		case tcpip.ErrDuplicateNICID:
 			// Race detected: A NIC has been created in between.
 			continue
 		default:
-			return nil, syserror.EINVAL
+			return nil, false, syserror.EINVAL
 		}
 	}
 }
@@ -331,19 +347,18 @@ func (d *Device) WriteNotify() {
 // It is ref-counted as multiple opening files can attach to the same NIC.
 // The last owner is responsible for deleting the NIC.
 type tunEndpoint struct {
+	tunEndpointRefs
 	*channel.Endpoint
 
-	refs.AtomicRefCount
-
 	stack *stack.Stack
 	nicID tcpip.NICID
 	name  string
 	isTap bool
 }
 
-// DecRef decrements refcount of e, removes NIC if refcount goes to 0.
+// DecRef decrements refcount of e, removing NIC if it reaches 0.
 func (e *tunEndpoint) DecRef(ctx context.Context) {
-	e.DecRefWithDestructor(ctx, func(context.Context) {
+	e.tunEndpointRefs.DecRef(func() {
 		e.stack.RemoveNIC(e.nicID)
 	})
 }
diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD
index 46083925c..c118a2929 100644
--- a/pkg/tcpip/network/BUILD
+++ b/pkg/tcpip/network/BUILD
@@ -9,14 +9,17 @@ go_test(
         "ip_test.go",
     ],
     deps = [
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
+        "//pkg/tcpip/checker",
         "//pkg/tcpip/header",
         "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/loopback",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
         "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/icmp",
         "//pkg/tcpip/transport/tcp",
         "//pkg/tcpip/transport/udp",
     ],
diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD
index eddf7b725..8a6bcfc2c 100644
--- a/pkg/tcpip/network/arp/BUILD
+++ b/pkg/tcpip/network/arp/BUILD
@@ -10,6 +10,7 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/header/parse",
         "//pkg/tcpip/stack",
     ],
 )
@@ -28,5 +29,7 @@ go_test(
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/icmp",
+        "@com_github_google_go_cmp//cmp:go_default_library",
+        "@com_github_google_go_cmp//cmp/cmpopts:go_default_library",
     ],
 )
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 920872c3f..a79379abb 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -15,20 +15,16 @@
 // Package arp implements the ARP network protocol. It is used to resolve
 // IPv4 addresses into link-local MAC addresses, and advertises IPv4
 // addresses of its stack with the local network.
-//
-// To use it in the networking stack, pass arp.NewProtocol() as one of the
-// network protocols when calling stack.New. Then add an "arp" address to every
-// NIC on the stack that should respond to ARP requests. That is:
-//
-//	if err := s.AddAddress(1, arp.ProtocolNumber, "arp"); err != nil {
-//		// handle err
-//	}
 package arp
 
 import (
+	"fmt"
+	"sync/atomic"
+
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/header/parse"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
@@ -40,12 +36,54 @@ const (
 	ProtocolAddress = tcpip.Address("arp")
 )
 
-// endpoint implements stack.NetworkEndpoint.
+var _ stack.AddressableEndpoint = (*endpoint)(nil)
+var _ stack.NetworkEndpoint = (*endpoint)(nil)
+
 type endpoint struct {
-	protocol      *protocol
-	nicID         tcpip.NICID
-	linkEP        stack.LinkEndpoint
+	stack.AddressableEndpointState
+
+	protocol *protocol
+
+	// enabled is set to 1 when the NIC is enabled and 0 when it is disabled.
+	//
+	// Must be accessed using atomic operations.
+	enabled uint32
+
+	nic           stack.NetworkInterface
 	linkAddrCache stack.LinkAddressCache
+	nud           stack.NUDHandler
+}
+
+func (e *endpoint) Enable() *tcpip.Error {
+	if !e.nic.Enabled() {
+		return tcpip.ErrNotPermitted
+	}
+
+	e.setEnabled(true)
+	return nil
+}
+
+func (e *endpoint) Enabled() bool {
+	return e.nic.Enabled() && e.isEnabled()
+}
+
+// isEnabled returns true if the endpoint is enabled, regardless of the
+// enabled status of the NIC.
+func (e *endpoint) isEnabled() bool {
+	return atomic.LoadUint32(&e.enabled) == 1
+}
+
+// setEnabled sets the enabled status for the endpoint.
+func (e *endpoint) setEnabled(v bool) {
+	if v {
+		atomic.StoreUint32(&e.enabled, 1)
+	} else {
+		atomic.StoreUint32(&e.enabled, 0)
+	}
+}
+
+func (e *endpoint) Disable() {
+	e.setEnabled(false)
 }
 
 // DefaultTTL is unused for ARP. It implements stack.NetworkEndpoint.
@@ -54,23 +92,17 @@ func (e *endpoint) DefaultTTL() uint8 {
 }
 
 func (e *endpoint) MTU() uint32 {
-	lmtu := e.linkEP.MTU()
+	lmtu := e.nic.MTU()
 	return lmtu - uint32(e.MaxHeaderLength())
 }
 
-func (e *endpoint) NICID() tcpip.NICID {
-	return e.nicID
-}
-
-func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
-	return e.linkEP.Capabilities()
-}
-
 func (e *endpoint) MaxHeaderLength() uint16 {
-	return e.linkEP.MaxHeaderLength() + header.ARPSize
+	return e.nic.MaxHeaderLength() + header.ARPSize
 }
 
-func (e *endpoint) Close() {}
+func (e *endpoint) Close() {
+	e.AddressableEndpointState.Cleanup()
+}
 
 func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, *stack.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
@@ -78,7 +110,7 @@ func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderPara
 
 // NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber.
 func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
-	return e.protocol.Number()
+	return ProtocolNumber
 }
 
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
@@ -91,6 +123,10 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBu
 }
 
 func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+	if !e.isEnabled() {
+		return
+	}
+
 	h := header.ARP(pkt.NetworkHeader().View())
 	if !h.IsValid() {
 		return
@@ -99,30 +135,80 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 	switch h.Op() {
 	case header.ARPRequest:
 		localAddr := tcpip.Address(h.ProtocolAddressTarget())
-		if e.linkAddrCache.CheckLocalAddress(e.nicID, header.IPv4ProtocolNumber, localAddr) == 0 {
-			return // we have no useful answer, ignore the request
+
+		if e.nud == nil {
+			if e.linkAddrCache.CheckLocalAddress(e.nic.ID(), header.IPv4ProtocolNumber, localAddr) == 0 {
+				return // we have no useful answer, ignore the request
+			}
+
+			addr := tcpip.Address(h.ProtocolAddressSender())
+			linkAddr := tcpip.LinkAddress(h.HardwareAddressSender())
+			e.linkAddrCache.AddLinkAddress(e.nic.ID(), addr, linkAddr)
+		} else {
+			if r.Stack().CheckLocalAddress(e.nic.ID(), header.IPv4ProtocolNumber, localAddr) == 0 {
+				return // we have no useful answer, ignore the request
+			}
+
+			remoteAddr := tcpip.Address(h.ProtocolAddressSender())
+			remoteLinkAddr := tcpip.LinkAddress(h.HardwareAddressSender())
+			e.nud.HandleProbe(remoteAddr, ProtocolNumber, remoteLinkAddr, e.protocol)
 		}
-		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-			ReserveHeaderBytes: int(e.linkEP.MaxHeaderLength()) + header.ARPSize,
+
+		respPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+			ReserveHeaderBytes: int(e.nic.MaxHeaderLength()) + header.ARPSize,
 		})
-		packet := header.ARP(pkt.NetworkHeader().Push(header.ARPSize))
+		packet := header.ARP(respPkt.NetworkHeader().Push(header.ARPSize))
 		packet.SetIPv4OverEthernet()
 		packet.SetOp(header.ARPReply)
-		copy(packet.HardwareAddressSender(), r.LocalLinkAddress[:])
-		copy(packet.ProtocolAddressSender(), h.ProtocolAddressTarget())
-		copy(packet.HardwareAddressTarget(), h.HardwareAddressSender())
-		copy(packet.ProtocolAddressTarget(), h.ProtocolAddressSender())
-		_ = e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt)
-		fallthrough // also fill the cache from requests
+		// TODO(gvisor.dev/issue/4582): check copied length once TAP devices have a
+		// link address.
+		_ = copy(packet.HardwareAddressSender(), e.nic.LinkAddress())
+		if n := copy(packet.ProtocolAddressSender(), h.ProtocolAddressTarget()); n != header.IPv4AddressSize {
+			panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize))
+		}
+		origSender := h.HardwareAddressSender()
+		if n := copy(packet.HardwareAddressTarget(), origSender); n != header.EthernetAddressSize {
+			panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.EthernetAddressSize))
+		}
+		if n := copy(packet.ProtocolAddressTarget(), h.ProtocolAddressSender()); n != header.IPv4AddressSize {
+			panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize))
+		}
+
+		// As per RFC 826, under Packet Reception:
+		//   Swap hardware and protocol fields, putting the local hardware and
+		//   protocol addresses in the sender fields.
+		//
+		//   Send the packet to the (new) target hardware address on the same
+		//   hardware on which the request was received.
+		_ = e.nic.WritePacketToRemote(tcpip.LinkAddress(origSender), nil /* gso */, ProtocolNumber, respPkt)
+
 	case header.ARPReply:
 		addr := tcpip.Address(h.ProtocolAddressSender())
 		linkAddr := tcpip.LinkAddress(h.HardwareAddressSender())
-		e.linkAddrCache.AddLinkAddress(e.nicID, addr, linkAddr)
+
+		if e.nud == nil {
+			e.linkAddrCache.AddLinkAddress(e.nic.ID(), addr, linkAddr)
+			return
+		}
+
+		// The solicited, override, and isRouter flags are not available for ARP;
+		// they are only available for IPv6 Neighbor Advertisements.
+		e.nud.HandleConfirmation(addr, linkAddr, stack.ReachabilityConfirmationFlags{
+			// Solicited and unsolicited (also referred to as gratuitous) ARP Replies
+			// are handled equivalently to a solicited Neighbor Advertisement.
+			Solicited: true,
+			// If a different link address is received than the one cached, the entry
+			// should always go to Stale.
+			Override: false,
+			// ARP does not distinguish between router and non-router hosts.
+			IsRouter: false,
+		})
 	}
 }
 
 // protocol implements stack.NetworkProtocol and stack.LinkAddressResolver.
 type protocol struct {
+	stack *stack.Stack
 }
 
 func (p *protocol) Number() tcpip.NetworkProtocolNumber { return ProtocolNumber }
@@ -134,13 +220,15 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 	return tcpip.Address(h.ProtocolAddressSender()), ProtocolAddress
 }
 
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, sender stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
-	return &endpoint{
+func (p *protocol) NewEndpoint(nic stack.NetworkInterface, linkAddrCache stack.LinkAddressCache, nud stack.NUDHandler, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint {
+	e := &endpoint{
 		protocol:      p,
-		nicID:         nicID,
-		linkEP:        sender,
+		nic:           nic,
 		linkAddrCache: linkAddrCache,
+		nud:           nud,
 	}
+	e.AddressableEndpointState.Init(e)
+	return e
 }
 
 // LinkAddressProtocol implements stack.LinkAddressResolver.LinkAddressProtocol.
@@ -149,25 +237,44 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
 }
 
 // LinkAddressRequest implements stack.LinkAddressResolver.LinkAddressRequest.
-func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP stack.LinkEndpoint) *tcpip.Error {
-	r := &stack.Route{
-		RemoteLinkAddress: remoteLinkAddr,
+func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, nic stack.NetworkInterface) *tcpip.Error {
+	if len(remoteLinkAddr) == 0 {
+		remoteLinkAddr = header.EthernetBroadcastAddress
 	}
-	if len(r.RemoteLinkAddress) == 0 {
-		r.RemoteLinkAddress = header.EthernetBroadcastAddress
+
+	nicID := nic.ID()
+	if len(localAddr) == 0 {
+		addr, err := p.stack.GetMainNICAddress(nicID, header.IPv4ProtocolNumber)
+		if err != nil {
+			return err
+		}
+
+		if len(addr.Address) == 0 {
+			return tcpip.ErrNetworkUnreachable
+		}
+
+		localAddr = addr.Address
+	} else if p.stack.CheckLocalAddress(nicID, header.IPv4ProtocolNumber, localAddr) == 0 {
+		return tcpip.ErrBadLocalAddress
 	}
 
 	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-		ReserveHeaderBytes: int(linkEP.MaxHeaderLength()) + header.ARPSize,
+		ReserveHeaderBytes: int(nic.MaxHeaderLength()) + header.ARPSize,
 	})
 	h := header.ARP(pkt.NetworkHeader().Push(header.ARPSize))
+	pkt.NetworkProtocolNumber = ProtocolNumber
 	h.SetIPv4OverEthernet()
 	h.SetOp(header.ARPRequest)
-	copy(h.HardwareAddressSender(), linkEP.LinkAddress())
-	copy(h.ProtocolAddressSender(), localAddr)
-	copy(h.ProtocolAddressTarget(), addr)
-
-	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt)
+	// TODO(gvisor.dev/issue/4582): check copied length once TAP devices have a
+	// link address.
+	_ = copy(h.HardwareAddressSender(), nic.LinkAddress())
+	if n := copy(h.ProtocolAddressSender(), localAddr); n != header.IPv4AddressSize {
+		panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize))
+	}
+	if n := copy(h.ProtocolAddressTarget(), targetAddr); n != header.IPv4AddressSize {
+		panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize))
+	}
+	return nic.WritePacketToRemote(remoteLinkAddr, nil /* gso */, ProtocolNumber, pkt)
 }
 
 // ResolveStaticAddress implements stack.LinkAddressResolver.ResolveStaticAddress.
@@ -182,12 +289,12 @@ func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bo
 }
 
 // SetOption implements stack.NetworkProtocol.SetOption.
-func (*protocol) SetOption(option interface{}) *tcpip.Error {
+func (*protocol) SetOption(tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
 // Option implements stack.NetworkProtocol.Option.
-func (*protocol) Option(option interface{}) *tcpip.Error {
+func (*protocol) Option(tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
@@ -199,14 +306,14 @@ func (*protocol) Wait() {}
 
 // Parse implements stack.NetworkProtocol.Parse.
 func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
-	_, ok = pkt.NetworkHeader().Consume(header.ARPSize)
-	if !ok {
-		return 0, false, false
-	}
-	return 0, false, true
+	return 0, false, parse.ARP(pkt)
 }
 
 // NewProtocol returns an ARP network protocol.
-func NewProtocol() stack.NetworkProtocol {
-	return &protocol{}
+//
+// Note, to make sure that the ARP endpoint receives ARP packets, the "arp"
+// address must be added to every NIC that should respond to ARP requests. See
+// ProtocolAddress for more details.
+func NewProtocol(s *stack.Stack) stack.NetworkProtocol {
+	return &protocol{stack: s}
 }
diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
index c2c3e6891..bf1292bb8 100644
--- a/pkg/tcpip/network/arp/arp_test.go
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -16,10 +16,13 @@ package arp_test
 
 import (
 	"context"
+	"fmt"
 	"strconv"
 	"testing"
 	"time"
 
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -32,57 +35,184 @@ import (
 )
 
 const (
-	stackLinkAddr1 = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c")
-	stackLinkAddr2 = tcpip.LinkAddress("\x0b\x0b\x0c\x0c\x0d\x0d")
-	stackAddr1     = tcpip.Address("\x0a\x00\x00\x01")
-	stackAddr2     = tcpip.Address("\x0a\x00\x00\x02")
-	stackAddrBad   = tcpip.Address("\x0a\x00\x00\x03")
+	nicID = 1
+
+	stackAddr     = tcpip.Address("\x0a\x00\x00\x01")
+	stackLinkAddr = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c")
+
+	remoteAddr     = tcpip.Address("\x0a\x00\x00\x02")
+	remoteLinkAddr = tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06")
+
+	unknownAddr = tcpip.Address("\x0a\x00\x00\x03")
 
 	defaultChannelSize = 1
 	defaultMTU         = 65536
+
+	// eventChanSize defines the size of event channels used by the neighbor
+	// cache's event dispatcher. The size chosen here needs to be sufficient to
+	// queue all the events received during tests before consumption.
+	// If eventChanSize is too small, the tests may deadlock.
+	eventChanSize = 32
+)
+
+type eventType uint8
+
+const (
+	entryAdded eventType = iota
+	entryChanged
+	entryRemoved
 )
 
+func (t eventType) String() string {
+	switch t {
+	case entryAdded:
+		return "add"
+	case entryChanged:
+		return "change"
+	case entryRemoved:
+		return "remove"
+	default:
+		return fmt.Sprintf("unknown (%d)", t)
+	}
+}
+
+type eventInfo struct {
+	eventType eventType
+	nicID     tcpip.NICID
+	entry     stack.NeighborEntry
+}
+
+func (e eventInfo) String() string {
+	return fmt.Sprintf("%s event for NIC #%d, %#v", e.eventType, e.nicID, e.entry)
+}
+
+// arpDispatcher implements NUDDispatcher to validate the dispatching of
+// events upon certain NUD state machine events.
+type arpDispatcher struct {
+	// C is where events are queued
+	C chan eventInfo
+}
+
+var _ stack.NUDDispatcher = (*arpDispatcher)(nil)
+
+func (d *arpDispatcher) OnNeighborAdded(nicID tcpip.NICID, entry stack.NeighborEntry) {
+	e := eventInfo{
+		eventType: entryAdded,
+		nicID:     nicID,
+		entry:     entry,
+	}
+	d.C <- e
+}
+
+func (d *arpDispatcher) OnNeighborChanged(nicID tcpip.NICID, entry stack.NeighborEntry) {
+	e := eventInfo{
+		eventType: entryChanged,
+		nicID:     nicID,
+		entry:     entry,
+	}
+	d.C <- e
+}
+
+func (d *arpDispatcher) OnNeighborRemoved(nicID tcpip.NICID, entry stack.NeighborEntry) {
+	e := eventInfo{
+		eventType: entryRemoved,
+		nicID:     nicID,
+		entry:     entry,
+	}
+	d.C <- e
+}
+
+func (d *arpDispatcher) waitForEvent(ctx context.Context, want eventInfo) error {
+	select {
+	case got := <-d.C:
+		if diff := cmp.Diff(got, want, cmp.AllowUnexported(got), cmpopts.IgnoreFields(stack.NeighborEntry{}, "UpdatedAt")); diff != "" {
+			return fmt.Errorf("got invalid event (-got +want):\n%s", diff)
+		}
+	case <-ctx.Done():
+		return fmt.Errorf("%s for %s", ctx.Err(), want)
+	}
+	return nil
+}
+
+func (d *arpDispatcher) waitForEventWithTimeout(want eventInfo, timeout time.Duration) error {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	return d.waitForEvent(ctx, want)
+}
+
+func (d *arpDispatcher) nextEvent() (eventInfo, bool) {
+	select {
+	case event := <-d.C:
+		return event, true
+	default:
+		return eventInfo{}, false
+	}
+}
+
 type testContext struct {
-	t      *testing.T
-	linkEP *channel.Endpoint
-	s      *stack.Stack
+	s       *stack.Stack
+	linkEP  *channel.Endpoint
+	nudDisp *arpDispatcher
 }
 
-func newTestContext(t *testing.T) *testContext {
+func newTestContext(t *testing.T, useNeighborCache bool) *testContext {
+	c := stack.DefaultNUDConfigurations()
+	// Transition from Reachable to Stale almost immediately to test if receiving
+	// probes refreshes positive reachability.
+	c.BaseReachableTime = time.Microsecond
+
+	d := arpDispatcher{
+		// Create an event channel large enough so the neighbor cache doesn't block
+		// while dispatching events. Blocking could interfere with the timing of
+		// NUD transitions.
+		C: make(chan eventInfo, eventChanSize),
+	}
+
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), arp.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol4()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, arp.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol4},
+		NUDConfigs:         c,
+		NUDDisp:            &d,
+		UseNeighborCache:   useNeighborCache,
 	})
 
-	ep := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr1)
+	ep := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr)
+	ep.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+
 	wep := stack.LinkEndpoint(ep)
 
 	if testing.Verbose() {
 		wep = sniffer.New(ep)
 	}
-	if err := s.CreateNIC(1, wep); err != nil {
+	if err := s.CreateNIC(nicID, wep); err != nil {
 		t.Fatalf("CreateNIC failed: %v", err)
 	}
 
-	if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr1); err != nil {
+	if err := s.AddAddress(nicID, ipv4.ProtocolNumber, stackAddr); err != nil {
 		t.Fatalf("AddAddress for ipv4 failed: %v", err)
 	}
-	if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr2); err != nil {
-		t.Fatalf("AddAddress for ipv4 failed: %v", err)
+	if !useNeighborCache {
+		// The remote address needs to be assigned to the NIC so we can receive and
+		// verify outgoing ARP packets. The neighbor cache isn't concerned with
+		// this; the tests that use linkAddrCache expect the ARP responses to be
+		// received by the same NIC.
+		if err := s.AddAddress(nicID, ipv4.ProtocolNumber, remoteAddr); err != nil {
+			t.Fatalf("AddAddress for ipv4 failed: %v", err)
+		}
 	}
-	if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+	if err := s.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
 		t.Fatalf("AddAddress for arp failed: %v", err)
 	}
 
 	s.SetRouteTable([]tcpip.Route{{
 		Destination: header.IPv4EmptySubnet,
-		NIC:         1,
+		NIC:         nicID,
 	}})
 
 	return &testContext{
-		t:      t,
-		s:      s,
-		linkEP: ep,
+		s:       s,
+		linkEP:  ep,
+		nudDisp: &d,
 	}
 }
 
@@ -91,7 +221,7 @@ func (c *testContext) cleanup() {
 }
 
 func TestDirectRequest(t *testing.T) {
-	c := newTestContext(t)
+	c := newTestContext(t, false /* useNeighborCache */)
 	defer c.cleanup()
 
 	const senderMAC = "\x01\x02\x03\x04\x05\x06"
@@ -111,7 +241,7 @@ func TestDirectRequest(t *testing.T) {
 		}))
 	}
 
-	for i, address := range []tcpip.Address{stackAddr1, stackAddr2} {
+	for i, address := range []tcpip.Address{stackAddr, remoteAddr} {
 		t.Run(strconv.Itoa(i), func(t *testing.T) {
 			inject(address)
 			pi, _ := c.linkEP.ReadContext(context.Background())
@@ -122,7 +252,7 @@ func TestDirectRequest(t *testing.T) {
 			if !rep.IsValid() {
 				t.Fatalf("invalid ARP response: len = %d; response = %x", len(rep), rep)
 			}
-			if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr1; got != want {
+			if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr; got != want {
 				t.Errorf("got HardwareAddressSender = %s, want = %s", got, want)
 			}
 			if got, want := tcpip.Address(rep.ProtocolAddressSender()), tcpip.Address(h.ProtocolAddressTarget()); got != want {
@@ -137,7 +267,7 @@ func TestDirectRequest(t *testing.T) {
 		})
 	}
 
-	inject(stackAddrBad)
+	inject(unknownAddr)
 	// Sleep tests are gross, but this will only potentially flake
 	// if there's a bug. If there is no bug this will reliably
 	// succeed.
@@ -148,43 +278,301 @@ func TestDirectRequest(t *testing.T) {
 	}
 }
 
+func TestDirectRequestWithNeighborCache(t *testing.T) {
+	c := newTestContext(t, true /* useNeighborCache */)
+	defer c.cleanup()
+
+	tests := []struct {
+		name           string
+		senderAddr     tcpip.Address
+		senderLinkAddr tcpip.LinkAddress
+		targetAddr     tcpip.Address
+		isValid        bool
+	}{
+		{
+			name:           "Loopback",
+			senderAddr:     stackAddr,
+			senderLinkAddr: stackLinkAddr,
+			targetAddr:     stackAddr,
+			isValid:        true,
+		},
+		{
+			name:           "Remote",
+			senderAddr:     remoteAddr,
+			senderLinkAddr: remoteLinkAddr,
+			targetAddr:     stackAddr,
+			isValid:        true,
+		},
+		{
+			name:           "RemoteInvalidTarget",
+			senderAddr:     remoteAddr,
+			senderLinkAddr: remoteLinkAddr,
+			targetAddr:     unknownAddr,
+			isValid:        false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			// Inject an incoming ARP request.
+			v := make(buffer.View, header.ARPSize)
+			h := header.ARP(v)
+			h.SetIPv4OverEthernet()
+			h.SetOp(header.ARPRequest)
+			copy(h.HardwareAddressSender(), test.senderLinkAddr)
+			copy(h.ProtocolAddressSender(), test.senderAddr)
+			copy(h.ProtocolAddressTarget(), test.targetAddr)
+			c.linkEP.InjectInbound(arp.ProtocolNumber, &stack.PacketBuffer{
+				Data: v.ToVectorisedView(),
+			})
+
+			if !test.isValid {
+				// No packets should be sent after receiving an invalid ARP request.
+				// There is no need to perform a blocking read here, since packets are
+				// sent in the same function that handles ARP requests.
+				if pkt, ok := c.linkEP.Read(); ok {
+					t.Errorf("unexpected packet sent with network protocol number %d", pkt.Proto)
+				}
+				return
+			}
+
+			// Verify an ARP response was sent.
+			pi, ok := c.linkEP.Read()
+			if !ok {
+				t.Fatal("expected ARP response to be sent, got none")
+			}
+
+			if pi.Proto != arp.ProtocolNumber {
+				t.Fatalf("expected ARP response, got network protocol number %d", pi.Proto)
+			}
+			rep := header.ARP(pi.Pkt.NetworkHeader().View())
+			if !rep.IsValid() {
+				t.Fatalf("invalid ARP response: len = %d; response = %x", len(rep), rep)
+			}
+			if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr; got != want {
+				t.Errorf("got HardwareAddressSender() = %s, want = %s", got, want)
+			}
+			if got, want := tcpip.Address(rep.ProtocolAddressSender()), tcpip.Address(h.ProtocolAddressTarget()); got != want {
+				t.Errorf("got ProtocolAddressSender() = %s, want = %s", got, want)
+			}
+			if got, want := tcpip.LinkAddress(rep.HardwareAddressTarget()), tcpip.LinkAddress(h.HardwareAddressSender()); got != want {
+				t.Errorf("got HardwareAddressTarget() = %s, want = %s", got, want)
+			}
+			if got, want := tcpip.Address(rep.ProtocolAddressTarget()), tcpip.Address(h.ProtocolAddressSender()); got != want {
+				t.Errorf("got ProtocolAddressTarget() = %s, want = %s", got, want)
+			}
+
+			// Verify the sender was saved in the neighbor cache.
+			wantEvent := eventInfo{
+				eventType: entryAdded,
+				nicID:     nicID,
+				entry: stack.NeighborEntry{
+					Addr:     test.senderAddr,
+					LinkAddr: tcpip.LinkAddress(test.senderLinkAddr),
+					State:    stack.Stale,
+				},
+			}
+			if err := c.nudDisp.waitForEventWithTimeout(wantEvent, time.Second); err != nil {
+				t.Fatal(err)
+			}
+
+			neighbors, err := c.s.Neighbors(nicID)
+			if err != nil {
+				t.Fatalf("c.s.Neighbors(%d): %s", nicID, err)
+			}
+
+			neighborByAddr := make(map[tcpip.Address]stack.NeighborEntry)
+			for _, n := range neighbors {
+				if existing, ok := neighborByAddr[n.Addr]; ok {
+					if diff := cmp.Diff(existing, n); diff != "" {
+						t.Fatalf("duplicate neighbor entry found (-existing +got):\n%s", diff)
+					}
+					t.Fatalf("exact neighbor entry duplicate found for addr=%s", n.Addr)
+				}
+				neighborByAddr[n.Addr] = n
+			}
+
+			neigh, ok := neighborByAddr[test.senderAddr]
+			if !ok {
+				t.Fatalf("expected neighbor entry with Addr = %s", test.senderAddr)
+			}
+			if got, want := neigh.LinkAddr, test.senderLinkAddr; got != want {
+				t.Errorf("got neighbor LinkAddr = %s, want = %s", got, want)
+			}
+			if got, want := neigh.State, stack.Stale; got != want {
+				t.Errorf("got neighbor State = %s, want = %s", got, want)
+			}
+
+			// No more events should be dispatched
+			for {
+				event, ok := c.nudDisp.nextEvent()
+				if !ok {
+					break
+				}
+				t.Errorf("unexpected %s", event)
+			}
+		})
+	}
+}
+
+var _ stack.NetworkInterface = (*testInterface)(nil)
+
+type testInterface struct {
+	stack.LinkEndpoint
+
+	nicID tcpip.NICID
+}
+
+func (t *testInterface) ID() tcpip.NICID {
+	return t.nicID
+}
+
+func (*testInterface) IsLoopback() bool {
+	return false
+}
+
+func (*testInterface) Name() string {
+	return ""
+}
+
+func (*testInterface) Enabled() bool {
+	return true
+}
+
+func (t *testInterface) WritePacketToRemote(remoteLinkAddr tcpip.LinkAddress, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	r := stack.Route{
+		NetProto:          protocol,
+		RemoteLinkAddress: remoteLinkAddr,
+	}
+	return t.LinkEndpoint.WritePacket(&r, gso, protocol, pkt)
+}
+
 func TestLinkAddressRequest(t *testing.T) {
+	const nicID = 1
+
+	testAddr := tcpip.Address([]byte{1, 2, 3, 4})
+
 	tests := []struct {
 		name           string
+		nicAddr        tcpip.Address
+		localAddr      tcpip.Address
 		remoteLinkAddr tcpip.LinkAddress
-		expectLinkAddr tcpip.LinkAddress
+
+		expectedErr            *tcpip.Error
+		expectedLocalAddr      tcpip.Address
+		expectedRemoteLinkAddr tcpip.LinkAddress
 	}{
 		{
-			name:           "Unicast",
-			remoteLinkAddr: stackLinkAddr2,
-			expectLinkAddr: stackLinkAddr2,
+			name:                   "Unicast",
+			nicAddr:                stackAddr,
+			localAddr:              stackAddr,
+			remoteLinkAddr:         remoteLinkAddr,
+			expectedLocalAddr:      stackAddr,
+			expectedRemoteLinkAddr: remoteLinkAddr,
+		},
+		{
+			name:                   "Multicast",
+			nicAddr:                stackAddr,
+			localAddr:              stackAddr,
+			remoteLinkAddr:         "",
+			expectedLocalAddr:      stackAddr,
+			expectedRemoteLinkAddr: header.EthernetBroadcastAddress,
+		},
+		{
+			name:                   "Unicast with unspecified source",
+			nicAddr:                stackAddr,
+			remoteLinkAddr:         remoteLinkAddr,
+			expectedLocalAddr:      stackAddr,
+			expectedRemoteLinkAddr: remoteLinkAddr,
+		},
+		{
+			name:                   "Multicast with unspecified source",
+			nicAddr:                stackAddr,
+			remoteLinkAddr:         "",
+			expectedLocalAddr:      stackAddr,
+			expectedRemoteLinkAddr: header.EthernetBroadcastAddress,
+		},
+		{
+			name:           "Unicast with unassigned address",
+			localAddr:      testAddr,
+			remoteLinkAddr: remoteLinkAddr,
+			expectedErr:    tcpip.ErrBadLocalAddress,
 		},
 		{
-			name:           "Multicast",
+			name:           "Multicast with unassigned address",
+			localAddr:      testAddr,
 			remoteLinkAddr: "",
-			expectLinkAddr: header.EthernetBroadcastAddress,
+			expectedErr:    tcpip.ErrBadLocalAddress,
+		},
+		{
+			name:           "Unicast with no local address available",
+			remoteLinkAddr: remoteLinkAddr,
+			expectedErr:    tcpip.ErrNetworkUnreachable,
+		},
+		{
+			name:           "Multicast with no local address available",
+			remoteLinkAddr: "",
+			expectedErr:    tcpip.ErrNetworkUnreachable,
 		},
 	}
 
 	for _, test := range tests {
-		p := arp.NewProtocol()
-		linkRes, ok := p.(stack.LinkAddressResolver)
-		if !ok {
-			t.Fatal("expected ARP protocol to implement stack.LinkAddressResolver")
-		}
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocolFactory{arp.NewProtocol, ipv4.NewProtocol},
+			})
+			p := s.NetworkProtocolInstance(arp.ProtocolNumber)
+			linkRes, ok := p.(stack.LinkAddressResolver)
+			if !ok {
+				t.Fatal("expected ARP protocol to implement stack.LinkAddressResolver")
+			}
 
-		linkEP := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr1)
-		if err := linkRes.LinkAddressRequest(stackAddr1, stackAddr2, test.remoteLinkAddr, linkEP); err != nil {
-			t.Errorf("got p.LinkAddressRequest(%s, %s, %s, _) = %s", stackAddr1, stackAddr2, test.remoteLinkAddr, err)
-		}
+			linkEP := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr)
+			if err := s.CreateNIC(nicID, linkEP); err != nil {
+				t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err)
+			}
 
-		pkt, ok := linkEP.Read()
-		if !ok {
-			t.Fatal("expected to send a link address request")
-		}
+			if len(test.nicAddr) != 0 {
+				if err := s.AddAddress(nicID, ipv4.ProtocolNumber, test.nicAddr); err != nil {
+					t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, ipv4.ProtocolNumber, test.nicAddr, err)
+				}
+			}
 
-		if got, want := pkt.Route.RemoteLinkAddress, test.expectLinkAddr; got != want {
-			t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", got, want)
-		}
+			// We pass a test network interface to LinkAddressRequest with the same
+			// NIC ID and link endpoint used by the NIC we created earlier so that we
+			// can mock a link address request and observe the packets sent to the
+			// link endpoint even though the stack uses the real NIC to validate the
+			// local address.
+			if err := linkRes.LinkAddressRequest(remoteAddr, test.localAddr, test.remoteLinkAddr, &testInterface{LinkEndpoint: linkEP, nicID: nicID}); err != test.expectedErr {
+				t.Fatalf("got p.LinkAddressRequest(%s, %s, %s, _) = %s, want = %s", remoteAddr, test.localAddr, test.remoteLinkAddr, err, test.expectedErr)
+			}
+
+			if test.expectedErr != nil {
+				return
+			}
+
+			pkt, ok := linkEP.Read()
+			if !ok {
+				t.Fatal("expected to send a link address request")
+			}
+
+			if pkt.Route.RemoteLinkAddress != test.expectedRemoteLinkAddr {
+				t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", pkt.Route.RemoteLinkAddress, test.expectedRemoteLinkAddr)
+			}
+
+			rep := header.ARP(stack.PayloadSince(pkt.Pkt.NetworkHeader()))
+			if got := tcpip.LinkAddress(rep.HardwareAddressSender()); got != stackLinkAddr {
+				t.Errorf("got HardwareAddressSender = %s, want = %s", got, stackLinkAddr)
+			}
+			if got := tcpip.Address(rep.ProtocolAddressSender()); got != test.expectedLocalAddr {
+				t.Errorf("got ProtocolAddressSender = %s, want = %s", got, test.expectedLocalAddr)
+			}
+			if got, want := tcpip.LinkAddress(rep.HardwareAddressTarget()), tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"); got != want {
+				t.Errorf("got HardwareAddressTarget = %s, want = %s", got, want)
+			}
+			if got := tcpip.Address(rep.ProtocolAddressTarget()); got != remoteAddr {
+				t.Errorf("got ProtocolAddressTarget = %s, want = %s", got, remoteAddr)
+			}
+		})
 	}
 }
diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD
index d1c728ccf..47fb63290 100644
--- a/pkg/tcpip/network/fragmentation/BUILD
+++ b/pkg/tcpip/network/fragmentation/BUILD
@@ -29,6 +29,8 @@ go_library(
         "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/stack",
     ],
 )
 
@@ -41,5 +43,10 @@ go_test(
         "reassembler_test.go",
     ],
     library = ":fragmentation",
-    deps = ["//pkg/tcpip/buffer"],
+    deps = [
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/faketime",
+        "//pkg/tcpip/network/testutil",
+        "@com_github_google_go_cmp//cmp:go_default_library",
+    ],
 )
diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go
index 1827666c5..936601287 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // Package fragmentation contains the implementation of IP fragmentation.
-// It is based on RFC 791 and RFC 815.
+// It is based on RFC 791, RFC 815 and RFC 8200.
 package fragmentation
 
 import (
@@ -25,12 +25,10 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
 const (
-	// DefaultReassembleTimeout is based on the linux stack: net.ipv4.ipfrag_time.
-	DefaultReassembleTimeout = 30 * time.Second
-
 	// HighFragThreshold is the threshold at which we start trimming old
 	// fragmented packets. Linux uses a default value of 4 MB. See
 	// net.ipv4.ipfrag_high_thresh for more information.
@@ -81,6 +79,8 @@ type Fragmentation struct {
 	size         int
 	timeout      time.Duration
 	blockSize    uint16
+	clock        tcpip.Clock
+	releaseJob   *tcpip.Job
 }
 
 // NewFragmentation creates a new Fragmentation.
@@ -97,7 +97,7 @@ type Fragmentation struct {
 // reassemblingTimeout specifies the maximum time allowed to reassemble a packet.
 // Fragments are lazily evicted only when a new a packet with an
 // already existing fragmentation-id arrives after the timeout.
-func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration) *Fragmentation {
+func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration, clock tcpip.Clock) *Fragmentation {
 	if lowMemoryLimit >= highMemoryLimit {
 		lowMemoryLimit = highMemoryLimit
 	}
@@ -110,69 +110,96 @@ func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, rea
 		blockSize = minBlockSize
 	}
 
-	return &Fragmentation{
+	f := &Fragmentation{
 		reassemblers: make(map[FragmentID]*reassembler),
 		highLimit:    highMemoryLimit,
 		lowLimit:     lowMemoryLimit,
 		timeout:      reassemblingTimeout,
 		blockSize:    blockSize,
+		clock:        clock,
 	}
+	f.releaseJob = tcpip.NewJob(f.clock, &f.mu, f.releaseReassemblersLocked)
+
+	return f
 }
 
 // Process processes an incoming fragment belonging to an ID and returns a
-// complete packet when all the packets belonging to that ID have been received.
+// complete packet and its protocol number when all the packets belonging to
+// that ID have been received.
 //
 // [first, last] is the range of the fragment bytes.
 //
 // first must be a multiple of the block size f is configured with. The size
 // of the fragment data must be a multiple of the block size, unless there are
 // no fragments following this fragment (more set to false).
-func (f *Fragmentation) Process(id FragmentID, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, error) {
+//
+// proto is the protocol number marked in the fragment being processed. It has
+// to be given here outside of the FragmentID struct because IPv6 should not use
+// the protocol to identify a fragment.
+//
+// releaseCB is a callback that will run when the fragment reassembly of a
+// packet is complete or cancelled. releaseCB take a a boolean argument which is
+// true iff the reassembly is cancelled due to timeout. releaseCB should be
+// passed only with the first fragment of a packet. If more than one releaseCB
+// are passed for the same packet, only the first releaseCB will be saved for
+// the packet and the succeeding ones will be dropped by running them
+// immediately with a false argument.
+func (f *Fragmentation) Process(
+	id FragmentID, first, last uint16, more bool, proto uint8, vv buffer.VectorisedView, releaseCB func(bool)) (
+	buffer.VectorisedView, uint8, bool, error) {
 	if first > last {
-		return buffer.VectorisedView{}, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs)
+		return buffer.VectorisedView{}, 0, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs)
 	}
 
 	if first%f.blockSize != 0 {
-		return buffer.VectorisedView{}, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs)
+		return buffer.VectorisedView{}, 0, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs)
 	}
 
 	fragmentSize := last - first + 1
 	if more && fragmentSize%f.blockSize != 0 {
-		return buffer.VectorisedView{}, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs)
+		return buffer.VectorisedView{}, 0, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs)
 	}
 
 	if l := vv.Size(); l < int(fragmentSize) {
-		return buffer.VectorisedView{}, false, fmt.Errorf("got fragment size=%d bytes less than the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs)
+		return buffer.VectorisedView{}, 0, false, fmt.Errorf("got fragment size=%d bytes less than the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs)
 	}
 	vv.CapLength(int(fragmentSize))
 
 	f.mu.Lock()
 	r, ok := f.reassemblers[id]
-	if ok && r.tooOld(f.timeout) {
-		// This is very likely to be an id-collision or someone performing a slow-rate attack.
-		f.release(r)
-		ok = false
-	}
 	if !ok {
-		r = newReassembler(id)
+		r = newReassembler(id, f.clock)
 		f.reassemblers[id] = r
+		wasEmpty := f.rList.Empty()
 		f.rList.PushFront(r)
+		if wasEmpty {
+			// If we have just pushed a first reassembler into an empty list, we
+			// should kickstart the release job. The release job will keep
+			// rescheduling itself until the list becomes empty.
+			f.releaseReassemblersLocked()
+		}
+	}
+	if releaseCB != nil {
+		if !r.setCallback(releaseCB) {
+			// We got a duplicate callback. Release it immediately.
+			releaseCB(false /* timedOut */)
+		}
 	}
 	f.mu.Unlock()
 
-	res, done, consumed, err := r.process(first, last, more, vv)
+	res, firstFragmentProto, done, consumed, err := r.process(first, last, more, proto, vv)
 	if err != nil {
 		// We probably got an invalid sequence of fragments. Just
 		// discard the reassembler and move on.
 		f.mu.Lock()
-		f.release(r)
+		f.release(r, false /* timedOut */)
 		f.mu.Unlock()
-		return buffer.VectorisedView{}, false, fmt.Errorf("fragmentation processing error: %v", err)
+		return buffer.VectorisedView{}, 0, false, fmt.Errorf("fragmentation processing error: %w", err)
 	}
 	f.mu.Lock()
 	f.size += consumed
 	if done {
-		f.release(r)
+		f.release(r, false /* timedOut */)
 	}
 	// Evict reassemblers if we are consuming more memory than highLimit until
 	// we reach lowLimit.
@@ -182,14 +209,14 @@ func (f *Fragmentation) Process(id FragmentID, first, last uint16, more bool, vv
 			if tail == nil {
 				break
 			}
-			f.release(tail)
+			f.release(tail, false /* timedOut */)
 		}
 	}
 	f.mu.Unlock()
-	return res, done, nil
+	return res, firstFragmentProto, done, nil
 }
 
-func (f *Fragmentation) release(r *reassembler) {
+func (f *Fragmentation) release(r *reassembler, timedOut bool) {
 	// Before releasing a fragment we need to check if r is already marked as done.
 	// Otherwise, we would delete it twice.
 	if r.checkDoneOrMark() {
@@ -203,4 +230,105 @@ func (f *Fragmentation) release(r *reassembler) {
 		log.Printf("memory counter < 0 (%d), this is an accounting bug that requires investigation", f.size)
 		f.size = 0
 	}
+
+	r.release(timedOut) // releaseCB may run.
+}
+
+// releaseReassemblersLocked releases already-expired reassemblers, then
+// schedules the job to call back itself for the remaining reassemblers if
+// any. This function must be called with f.mu locked.
+func (f *Fragmentation) releaseReassemblersLocked() {
+	now := f.clock.NowMonotonic()
+	for {
+		// The reassembler at the end of the list is the oldest.
+		r := f.rList.Back()
+		if r == nil {
+			// The list is empty.
+			break
+		}
+		elapsed := time.Duration(now-r.creationTime) * time.Nanosecond
+		if f.timeout > elapsed {
+			// If the oldest reassembler has not expired, schedule the release
+			// job so that this function is called back when it has expired.
+			f.releaseJob.Schedule(f.timeout - elapsed)
+			break
+		}
+		// If the oldest reassembler has already expired, release it.
+		f.release(r, true /* timedOut*/)
+	}
+}
+
+// PacketFragmenter is the book-keeping struct for packet fragmentation.
+type PacketFragmenter struct {
+	transportHeader    buffer.View
+	data               buffer.VectorisedView
+	reserve            int
+	fragmentPayloadLen int
+	fragmentCount      int
+	currentFragment    int
+	fragmentOffset     int
+}
+
+// MakePacketFragmenter prepares the struct needed for packet fragmentation.
+//
+// pkt is the packet to be fragmented.
+//
+// fragmentPayloadLen is the maximum number of bytes of fragmentable data a fragment can
+// have.
+//
+// reserve is the number of bytes that should be reserved for the headers in
+// each generated fragment.
+func MakePacketFragmenter(pkt *stack.PacketBuffer, fragmentPayloadLen uint32, reserve int) PacketFragmenter {
+	// As per RFC 8200 Section 4.5, some IPv6 extension headers should not be
+	// repeated in each fragment. However we do not currently support any header
+	// of that kind yet, so the following computation is valid for both IPv4 and
+	// IPv6.
+	// TODO(gvisor.dev/issue/3912): Once Authentication or ESP Headers are
+	// supported for outbound packets, the fragmentable data should not include
+	// these headers.
+	var fragmentableData buffer.VectorisedView
+	fragmentableData.AppendView(pkt.TransportHeader().View())
+	fragmentableData.Append(pkt.Data)
+	fragmentCount := (uint32(fragmentableData.Size()) + fragmentPayloadLen - 1) / fragmentPayloadLen
+
+	return PacketFragmenter{
+		data:               fragmentableData,
+		reserve:            reserve,
+		fragmentPayloadLen: int(fragmentPayloadLen),
+		fragmentCount:      int(fragmentCount),
+	}
+}
+
+// BuildNextFragment returns a packet with the payload of the next fragment,
+// along with the fragment's offset, the number of bytes copied and a boolean
+// indicating if there are more fragments left or not. If this function is
+// called again after it indicated that no more fragments were left, it will
+// panic.
+//
+// Note that the returned packet will not have its network and link headers
+// populated, but space for them will be reserved. The transport header will be
+// stored in the packet's data.
+func (pf *PacketFragmenter) BuildNextFragment() (*stack.PacketBuffer, int, int, bool) {
+	if pf.currentFragment >= pf.fragmentCount {
+		panic("BuildNextFragment should not be called again after the last fragment was returned")
+	}
+
+	fragPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+		ReserveHeaderBytes: pf.reserve,
+	})
+
+	// Copy data for the fragment.
+	copied := pf.data.ReadToVV(&fragPkt.Data, pf.fragmentPayloadLen)
+
+	offset := pf.fragmentOffset
+	pf.fragmentOffset += copied
+	pf.currentFragment++
+	more := pf.currentFragment != pf.fragmentCount
+
+	return fragPkt, offset, copied, more
+}
+
+// RemainingFragmentCount returns the number of fragments left to be built.
+func (pf *PacketFragmenter) RemainingFragmentCount() int {
+	return pf.fragmentCount - pf.currentFragment
 }
diff --git a/pkg/tcpip/network/fragmentation/fragmentation_test.go b/pkg/tcpip/network/fragmentation/fragmentation_test.go
index 9eedd33c4..5dcd10730 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation_test.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation_test.go
@@ -20,9 +20,16 @@ import (
 	"testing"
 	"time"
 
+	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/faketime"
+	"gvisor.dev/gvisor/pkg/tcpip/network/testutil"
 )
 
+// reassembleTimeout is dummy timeout used for testing, where the clock never
+// advances.
+const reassembleTimeout = 1
+
 // vv is a helper to build VectorisedView from different strings.
 func vv(size int, pieces ...string) buffer.VectorisedView {
 	views := make([]buffer.View, len(pieces))
@@ -38,12 +45,14 @@ type processInput struct {
 	first uint16
 	last  uint16
 	more  bool
+	proto uint8
 	vv    buffer.VectorisedView
 }
 
 type processOutput struct {
-	vv   buffer.VectorisedView
-	done bool
+	vv    buffer.VectorisedView
+	proto uint8
+	done  bool
 }
 
 var processTestCases = []struct {
@@ -63,6 +72,17 @@ var processTestCases = []struct {
 		},
 	},
 	{
+		comment: "Next Header protocol mismatch",
+		in: []processInput{
+			{id: FragmentID{ID: 0}, first: 0, last: 1, more: true, proto: 6, vv: vv(2, "01")},
+			{id: FragmentID{ID: 0}, first: 2, last: 3, more: false, proto: 17, vv: vv(2, "23")},
+		},
+		out: []processOutput{
+			{vv: buffer.VectorisedView{}, done: false},
+			{vv: vv(4, "01", "23"), proto: 6, done: true},
+		},
+	},
+	{
 		comment: "Two IDs",
 		in: []processInput{
 			{id: FragmentID{ID: 0}, first: 0, last: 1, more: true, vv: vv(2, "01")},
@@ -82,19 +102,27 @@ var processTestCases = []struct {
 func TestFragmentationProcess(t *testing.T) {
 	for _, c := range processTestCases {
 		t.Run(c.comment, func(t *testing.T) {
-			f := NewFragmentation(minBlockSize, 1024, 512, DefaultReassembleTimeout)
+			f := NewFragmentation(minBlockSize, 1024, 512, reassembleTimeout, &faketime.NullClock{})
+			firstFragmentProto := c.in[0].proto
 			for i, in := range c.in {
-				vv, done, err := f.Process(in.id, in.first, in.last, in.more, in.vv)
+				vv, proto, done, err := f.Process(in.id, in.first, in.last, in.more, in.proto, in.vv, nil)
 				if err != nil {
-					t.Fatalf("f.Process(%+v, %+d, %+d, %t, %+v) failed: %v", in.id, in.first, in.last, in.more, in.vv, err)
+					t.Fatalf("f.Process(%+v, %d, %d, %t, %d, %X) failed: %s",
+						in.id, in.first, in.last, in.more, in.proto, in.vv.ToView(), err)
 				}
 				if !reflect.DeepEqual(vv, c.out[i].vv) {
-					t.Errorf("got Process(%d) = %+v, want = %+v", i, vv, c.out[i].vv)
+					t.Errorf("got Process(%+v, %d, %d, %t, %d, %X) = (%X, _, _, _), want = (%X, _, _, _)",
+						in.id, in.first, in.last, in.more, in.proto, in.vv.ToView(), vv.ToView(), c.out[i].vv.ToView())
 				}
 				if done != c.out[i].done {
-					t.Errorf("got Process(%d) = %+v, want = %+v", i, done, c.out[i].done)
+					t.Errorf("got Process(%+v, %d, %d, %t, %d, _) = (_, _, %t, _), want = (_, _, %t, _)",
+						in.id, in.first, in.last, in.more, in.proto, done, c.out[i].done)
 				}
 				if c.out[i].done {
+					if firstFragmentProto != proto {
+						t.Errorf("got Process(%+v, %d, %d, %t, %d, _) = (_, %d, _, _), want = (_, %d, _, _)",
+							in.id, in.first, in.last, in.more, in.proto, proto, firstFragmentProto)
+					}
 					if _, ok := f.reassemblers[in.id]; ok {
 						t.Errorf("Process(%d) did not remove buffer from reassemblers", i)
 					}
@@ -110,35 +138,136 @@ func TestFragmentationProcess(t *testing.T) {
 }
 
 func TestReassemblingTimeout(t *testing.T) {
-	timeout := time.Millisecond
-	f := NewFragmentation(minBlockSize, 1024, 512, timeout)
-	// Send first fragment with id = 0, first = 0, last = 0, and more = true.
-	f.Process(FragmentID{}, 0, 0, true, vv(1, "0"))
-	// Sleep more than the timeout.
-	time.Sleep(2 * timeout)
-	// Send another fragment that completes a packet.
-	// However, no packet should be reassembled because the fragment arrived after the timeout.
-	_, done, err := f.Process(FragmentID{}, 1, 1, false, vv(1, "1"))
-	if err != nil {
-		t.Fatalf("f.Process(0, 1, 1, false, vv(1, \"1\")) failed: %v", err)
+	const (
+		reassemblyTimeout = time.Millisecond
+		protocol          = 0xff
+	)
+
+	type fragment struct {
+		first uint16
+		last  uint16
+		more  bool
+		data  string
+	}
+
+	type event struct {
+		// name is a nickname of this event.
+		name string
+
+		// clockAdvance is a duration to advance the clock. The clock advances
+		// before a fragment specified in the fragment field is processed.
+		clockAdvance time.Duration
+
+		// fragment is a fragment to process. This can be nil if there is no
+		// fragment to process.
+		fragment *fragment
+
+		// expectDone is true if the fragmentation instance should report the
+		// reassembly is done after the fragment is processd.
+		expectDone bool
+
+		// sizeAfterEvent is the expected size of the fragmentation instance after
+		// the event.
+		sizeAfterEvent int
+	}
+
+	half1 := &fragment{first: 0, last: 0, more: true, data: "0"}
+	half2 := &fragment{first: 1, last: 1, more: false, data: "1"}
+
+	tests := []struct {
+		name   string
+		events []event
+	}{
+		{
+			name: "half1 and half2 are reassembled successfully",
+			events: []event{
+				{
+					name:           "half1",
+					fragment:       half1,
+					expectDone:     false,
+					sizeAfterEvent: 1,
+				},
+				{
+					name:           "half2",
+					fragment:       half2,
+					expectDone:     true,
+					sizeAfterEvent: 0,
+				},
+			},
+		},
+		{
+			name: "half1 timeout, half2 timeout",
+			events: []event{
+				{
+					name:           "half1",
+					fragment:       half1,
+					expectDone:     false,
+					sizeAfterEvent: 1,
+				},
+				{
+					name:           "half1 just before reassembly timeout",
+					clockAdvance:   reassemblyTimeout - 1,
+					sizeAfterEvent: 1,
+				},
+				{
+					name:           "half1 reassembly timeout",
+					clockAdvance:   1,
+					sizeAfterEvent: 0,
+				},
+				{
+					name:           "half2",
+					fragment:       half2,
+					expectDone:     false,
+					sizeAfterEvent: 1,
+				},
+				{
+					name:           "half2 just before reassembly timeout",
+					clockAdvance:   reassemblyTimeout - 1,
+					sizeAfterEvent: 1,
+				},
+				{
+					name:           "half2 reassembly timeout",
+					clockAdvance:   1,
+					sizeAfterEvent: 0,
+				},
+			},
+		},
 	}
-	if done {
-		t.Errorf("Fragmentation does not respect the reassembling timeout.")
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			clock := faketime.NewManualClock()
+			f := NewFragmentation(minBlockSize, HighFragThreshold, LowFragThreshold, reassemblyTimeout, clock)
+			for _, event := range test.events {
+				clock.Advance(event.clockAdvance)
+				if frag := event.fragment; frag != nil {
+					_, _, done, err := f.Process(FragmentID{}, frag.first, frag.last, frag.more, protocol, vv(len(frag.data), frag.data), nil)
+					if err != nil {
+						t.Fatalf("%s: f.Process failed: %s", event.name, err)
+					}
+					if done != event.expectDone {
+						t.Fatalf("%s: got done = %t, want = %t", event.name, done, event.expectDone)
+					}
+				}
+				if got, want := f.size, event.sizeAfterEvent; got != want {
+					t.Errorf("%s: got f.size = %d, want = %d", event.name, got, want)
+				}
+			}
+		})
 	}
 }
 
 func TestMemoryLimits(t *testing.T) {
-	f := NewFragmentation(minBlockSize, 3, 1, DefaultReassembleTimeout)
+	f := NewFragmentation(minBlockSize, 3, 1, reassembleTimeout, &faketime.NullClock{})
 	// Send first fragment with id = 0.
-	f.Process(FragmentID{ID: 0}, 0, 0, true, vv(1, "0"))
+	f.Process(FragmentID{ID: 0}, 0, 0, true, 0xFF, vv(1, "0"), nil)
 	// Send first fragment with id = 1.
-	f.Process(FragmentID{ID: 1}, 0, 0, true, vv(1, "1"))
+	f.Process(FragmentID{ID: 1}, 0, 0, true, 0xFF, vv(1, "1"), nil)
 	// Send first fragment with id = 2.
-	f.Process(FragmentID{ID: 2}, 0, 0, true, vv(1, "2"))
+	f.Process(FragmentID{ID: 2}, 0, 0, true, 0xFF, vv(1, "2"), nil)
 
 	// Send first fragment with id = 3. This should caused id = 0 and id = 1 to be
 	// evicted.
-	f.Process(FragmentID{ID: 3}, 0, 0, true, vv(1, "3"))
+	f.Process(FragmentID{ID: 3}, 0, 0, true, 0xFF, vv(1, "3"), nil)
 
 	if _, ok := f.reassemblers[FragmentID{ID: 0}]; ok {
 		t.Errorf("Memory limits are not respected: id=0 has not been evicted.")
@@ -152,11 +281,11 @@ func TestMemoryLimits(t *testing.T) {
 }
 
 func TestMemoryLimitsIgnoresDuplicates(t *testing.T) {
-	f := NewFragmentation(minBlockSize, 1, 0, DefaultReassembleTimeout)
+	f := NewFragmentation(minBlockSize, 1, 0, reassembleTimeout, &faketime.NullClock{})
 	// Send first fragment with id = 0.
-	f.Process(FragmentID{}, 0, 0, true, vv(1, "0"))
+	f.Process(FragmentID{}, 0, 0, true, 0xFF, vv(1, "0"), nil)
 	// Send the same packet again.
-	f.Process(FragmentID{}, 0, 0, true, vv(1, "0"))
+	f.Process(FragmentID{}, 0, 0, true, 0xFF, vv(1, "0"), nil)
 
 	got := f.size
 	want := 1
@@ -247,13 +376,209 @@ func TestErrors(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			f := NewFragmentation(test.blockSize, HighFragThreshold, LowFragThreshold, DefaultReassembleTimeout)
-			_, done, err := f.Process(FragmentID{}, test.first, test.last, test.more, vv(len(test.data), test.data))
+			f := NewFragmentation(test.blockSize, HighFragThreshold, LowFragThreshold, reassembleTimeout, &faketime.NullClock{})
+			_, _, done, err := f.Process(FragmentID{}, test.first, test.last, test.more, 0, vv(len(test.data), test.data), nil)
 			if !errors.Is(err, test.err) {
-				t.Errorf("got Proceess(_, %d, %d, %t, %q) = (_, _, %v), want = (_, _, %v)", test.first, test.last, test.more, test.data, err, test.err)
+				t.Errorf("got Process(_, %d, %d, %t, _, %q) = (_, _, _, %v), want = (_, _, _, %v)", test.first, test.last, test.more, test.data, err, test.err)
 			}
 			if done {
-				t.Errorf("got Proceess(_, %d, %d, %t, %q) = (_, true, _), want = (_, false, _)", test.first, test.last, test.more, test.data)
+				t.Errorf("got Process(_, %d, %d, %t, _, %q) = (_, _, true, _), want = (_, _, false, _)", test.first, test.last, test.more, test.data)
+			}
+		})
+	}
+}
+
+type fragmentInfo struct {
+	remaining int
+	copied    int
+	offset    int
+	more      bool
+}
+
+func TestPacketFragmenter(t *testing.T) {
+	const (
+		reserve = 60
+		proto   = 0
+	)
+
+	tests := []struct {
+		name               string
+		fragmentPayloadLen uint32
+		transportHeaderLen int
+		payloadSize        int
+		wantFragments      []fragmentInfo
+	}{
+		{
+			name:               "Packet exactly fits in MTU",
+			fragmentPayloadLen: 1280,
+			transportHeaderLen: 0,
+			payloadSize:        1280,
+			wantFragments: []fragmentInfo{
+				{remaining: 0, copied: 1280, offset: 0, more: false},
+			},
+		},
+		{
+			name:               "Packet exactly does not fit in MTU",
+			fragmentPayloadLen: 1000,
+			transportHeaderLen: 0,
+			payloadSize:        1001,
+			wantFragments: []fragmentInfo{
+				{remaining: 1, copied: 1000, offset: 0, more: true},
+				{remaining: 0, copied: 1, offset: 1000, more: false},
+			},
+		},
+		{
+			name:               "Packet has a transport header",
+			fragmentPayloadLen: 560,
+			transportHeaderLen: 40,
+			payloadSize:        560,
+			wantFragments: []fragmentInfo{
+				{remaining: 1, copied: 560, offset: 0, more: true},
+				{remaining: 0, copied: 40, offset: 560, more: false},
+			},
+		},
+		{
+			name:               "Packet has a huge transport header",
+			fragmentPayloadLen: 500,
+			transportHeaderLen: 1300,
+			payloadSize:        500,
+			wantFragments: []fragmentInfo{
+				{remaining: 3, copied: 500, offset: 0, more: true},
+				{remaining: 2, copied: 500, offset: 500, more: true},
+				{remaining: 1, copied: 500, offset: 1000, more: true},
+				{remaining: 0, copied: 300, offset: 1500, more: false},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			pkt := testutil.MakeRandPkt(test.transportHeaderLen, reserve, []int{test.payloadSize}, proto)
+			var originalPayload buffer.VectorisedView
+			originalPayload.AppendView(pkt.TransportHeader().View())
+			originalPayload.Append(pkt.Data)
+			var reassembledPayload buffer.VectorisedView
+			pf := MakePacketFragmenter(pkt, test.fragmentPayloadLen, reserve)
+			for i := 0; ; i++ {
+				fragPkt, offset, copied, more := pf.BuildNextFragment()
+				wantFragment := test.wantFragments[i]
+				if got := pf.RemainingFragmentCount(); got != wantFragment.remaining {
+					t.Errorf("(fragment #%d) got pf.RemainingFragmentCount() = %d, want = %d", i, got, wantFragment.remaining)
+				}
+				if copied != wantFragment.copied {
+					t.Errorf("(fragment #%d) got copied = %d, want = %d", i, copied, wantFragment.copied)
+				}
+				if offset != wantFragment.offset {
+					t.Errorf("(fragment #%d) got offset = %d, want = %d", i, offset, wantFragment.offset)
+				}
+				if more != wantFragment.more {
+					t.Errorf("(fragment #%d) got more = %t, want = %t", i, more, wantFragment.more)
+				}
+				if got := uint32(fragPkt.Size()); got > test.fragmentPayloadLen {
+					t.Errorf("(fragment #%d) got fragPkt.Size() = %d, want <= %d", i, got, test.fragmentPayloadLen)
+				}
+				if got := fragPkt.AvailableHeaderBytes(); got != reserve {
+					t.Errorf("(fragment #%d) got fragPkt.AvailableHeaderBytes() = %d, want = %d", i, got, reserve)
+				}
+				if got := fragPkt.TransportHeader().View().Size(); got != 0 {
+					t.Errorf("(fragment #%d) got fragPkt.TransportHeader().View().Size() = %d, want = 0", i, got)
+				}
+				reassembledPayload.Append(fragPkt.Data)
+				if !more {
+					if i != len(test.wantFragments)-1 {
+						t.Errorf("got fragment count = %d, want = %d", i, len(test.wantFragments)-1)
+					}
+					break
+				}
+			}
+			if diff := cmp.Diff(reassembledPayload.ToView(), originalPayload.ToView()); diff != "" {
+				t.Errorf("reassembledPayload mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestReleaseCallback(t *testing.T) {
+	const (
+		proto = 99
+	)
+
+	var result int
+	var callbackReasonIsTimeout bool
+	cb1 := func(timedOut bool) { result = 1; callbackReasonIsTimeout = timedOut }
+	cb2 := func(timedOut bool) { result = 2; callbackReasonIsTimeout = timedOut }
+
+	tests := []struct {
+		name                        string
+		callbacks                   []func(bool)
+		timeout                     bool
+		wantResult                  int
+		wantCallbackReasonIsTimeout bool
+	}{
+		{
+			name:                        "callback runs on release",
+			callbacks:                   []func(bool){cb1},
+			timeout:                     false,
+			wantResult:                  1,
+			wantCallbackReasonIsTimeout: false,
+		},
+		{
+			name:                        "first callback is nil",
+			callbacks:                   []func(bool){nil, cb2},
+			timeout:                     false,
+			wantResult:                  2,
+			wantCallbackReasonIsTimeout: false,
+		},
+		{
+			name:                        "two callbacks - first one is set",
+			callbacks:                   []func(bool){cb1, cb2},
+			timeout:                     false,
+			wantResult:                  1,
+			wantCallbackReasonIsTimeout: false,
+		},
+		{
+			name:                        "callback runs on timeout",
+			callbacks:                   []func(bool){cb1},
+			timeout:                     true,
+			wantResult:                  1,
+			wantCallbackReasonIsTimeout: true,
+		},
+		{
+			name:                        "no callbacks",
+			callbacks:                   []func(bool){nil},
+			timeout:                     false,
+			wantResult:                  0,
+			wantCallbackReasonIsTimeout: false,
+		},
+	}
+
+	id := FragmentID{ID: 0}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			result = 0
+			callbackReasonIsTimeout = false
+
+			f := NewFragmentation(minBlockSize, HighFragThreshold, LowFragThreshold, reassembleTimeout, &faketime.NullClock{})
+
+			for i, cb := range test.callbacks {
+				_, _, _, err := f.Process(id, uint16(i), uint16(i), true, proto, vv(1, "0"), cb)
+				if err != nil {
+					t.Errorf("f.Process error = %s", err)
+				}
+			}
+
+			r, ok := f.reassemblers[id]
+			if !ok {
+				t.Fatalf("Reassemberr not found")
+			}
+			f.release(r, test.timeout)
+
+			if result != test.wantResult {
+				t.Errorf("got result = %d, want = %d", result, test.wantResult)
+			}
+			if callbackReasonIsTimeout != test.wantCallbackReasonIsTimeout {
+				t.Errorf("got callbackReasonIsTimeout = %t, want = %t", callbackReasonIsTimeout, test.wantCallbackReasonIsTimeout)
 			}
 		})
 	}
diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go
index 50d30bbf0..c0cc0bde0 100644
--- a/pkg/tcpip/network/fragmentation/reassembler.go
+++ b/pkg/tcpip/network/fragmentation/reassembler.go
@@ -18,9 +18,9 @@ import (
 	"container/heap"
 	"fmt"
 	"math"
-	"time"
 
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 )
 
@@ -34,21 +34,22 @@ type reassembler struct {
 	reassemblerEntry
 	id           FragmentID
 	size         int
+	proto        uint8
 	mu           sync.Mutex
 	holes        []hole
 	deleted      int
 	heap         fragHeap
 	done         bool
-	creationTime time.Time
+	creationTime int64
+	callback     func(bool)
 }
 
-func newReassembler(id FragmentID) *reassembler {
+func newReassembler(id FragmentID, clock tcpip.Clock) *reassembler {
 	r := &reassembler{
 		id:           id,
 		holes:        make([]hole, 0, 16),
-		deleted:      0,
 		heap:         make(fragHeap, 0, 8),
-		creationTime: time.Now(),
+		creationTime: clock.NowMonotonic(),
 	}
 	r.holes = append(r.holes, hole{
 		first:   0,
@@ -78,7 +79,7 @@ func (r *reassembler) updateHoles(first, last uint16, more bool) bool {
 	return used
 }
 
-func (r *reassembler) process(first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, int, error) {
+func (r *reassembler) process(first, last uint16, more bool, proto uint8, vv buffer.VectorisedView) (buffer.VectorisedView, uint8, bool, int, error) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 	consumed := 0
@@ -86,7 +87,18 @@ func (r *reassembler) process(first, last uint16, more bool, vv buffer.Vectorise
 		// A concurrent goroutine might have already reassembled
 		// the packet and emptied the heap while this goroutine
 		// was waiting on the mutex. We don't have to do anything in this case.
-		return buffer.VectorisedView{}, false, consumed, nil
+		return buffer.VectorisedView{}, 0, false, consumed, nil
+	}
+	// For IPv6, it is possible to have different Protocol values between
+	// fragments of a packet (because, unlike IPv4, the Protocol is not used to
+	// identify a fragment). In this case, only the Protocol of the first
+	// fragment must be used as per RFC 8200 Section 4.5.
+	//
+	// TODO(gvisor.dev/issue/3648): The entire first IP header should be recorded
+	// here (instead of just the protocol) because most IP options should be
+	// derived from the first fragment.
+	if first == 0 {
+		r.proto = proto
 	}
 	if r.updateHoles(first, last, more) {
 		// We store the incoming packet only if it filled some holes.
@@ -96,17 +108,13 @@ func (r *reassembler) process(first, last uint16, more bool, vv buffer.Vectorise
 	}
 	// Check if all the holes have been deleted and we are ready to reassamble.
 	if r.deleted < len(r.holes) {
-		return buffer.VectorisedView{}, false, consumed, nil
+		return buffer.VectorisedView{}, 0, false, consumed, nil
 	}
 	res, err := r.heap.reassemble()
 	if err != nil {
-		return buffer.VectorisedView{}, false, consumed, fmt.Errorf("fragment reassembly failed: %v", err)
+		return buffer.VectorisedView{}, 0, false, consumed, fmt.Errorf("fragment reassembly failed: %w", err)
 	}
-	return res, true, consumed, nil
-}
-
-func (r *reassembler) tooOld(timeout time.Duration) bool {
-	return time.Now().Sub(r.creationTime) > timeout
+	return res, r.proto, true, consumed, nil
 }
 
 func (r *reassembler) checkDoneOrMark() bool {
@@ -116,3 +124,24 @@ func (r *reassembler) checkDoneOrMark() bool {
 	r.mu.Unlock()
 	return prev
 }
+
+func (r *reassembler) setCallback(c func(bool)) bool {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if r.callback != nil {
+		return false
+	}
+	r.callback = c
+	return true
+}
+
+func (r *reassembler) release(timedOut bool) {
+	r.mu.Lock()
+	callback := r.callback
+	r.callback = nil
+	r.mu.Unlock()
+
+	if callback != nil {
+		callback(timedOut)
+	}
+}
diff --git a/pkg/tcpip/network/fragmentation/reassembler_test.go b/pkg/tcpip/network/fragmentation/reassembler_test.go
index dff7c9dcb..fa2a70dc8 100644
--- a/pkg/tcpip/network/fragmentation/reassembler_test.go
+++ b/pkg/tcpip/network/fragmentation/reassembler_test.go
@@ -18,6 +18,8 @@ import (
 	"math"
 	"reflect"
 	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip/faketime"
 )
 
 type updateHolesInput struct {
@@ -94,7 +96,7 @@ var holesTestCases = []struct {
 
 func TestUpdateHoles(t *testing.T) {
 	for _, c := range holesTestCases {
-		r := newReassembler(FragmentID{})
+		r := newReassembler(FragmentID{}, &faketime.NullClock{})
 		for _, i := range c.in {
 			r.updateHoles(i.first, i.last, i.more)
 		}
@@ -103,3 +105,26 @@ func TestUpdateHoles(t *testing.T) {
 		}
 	}
 }
+
+func TestSetCallback(t *testing.T) {
+	result := 0
+	reasonTimeout := false
+
+	cb1 := func(timedOut bool) { result = 1; reasonTimeout = timedOut }
+	cb2 := func(timedOut bool) { result = 2; reasonTimeout = timedOut }
+
+	r := newReassembler(FragmentID{}, &faketime.NullClock{})
+	if !r.setCallback(cb1) {
+		t.Errorf("setCallback failed")
+	}
+	if r.setCallback(cb2) {
+		t.Errorf("setCallback should fail if one is already set")
+	}
+	r.release(true)
+	if result != 1 {
+		t.Errorf("got result = %d, want = 1", result)
+	}
+	if !reasonTimeout {
+		t.Errorf("got reasonTimeout = %t, want = true", reasonTimeout)
+	}
+}
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index 9007346fe..5cc60b98b 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -15,36 +15,48 @@
 package ip_test
 
 import (
+	"strings"
 	"testing"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 )
 
 const (
-	localIpv4Addr      = "\x0a\x00\x00\x01"
-	localIpv4PrefixLen = 24
-	remoteIpv4Addr     = "\x0a\x00\x00\x02"
-	ipv4SubnetAddr     = "\x0a\x00\x00\x00"
-	ipv4SubnetMask     = "\xff\xff\xff\x00"
-	ipv4Gateway        = "\x0a\x00\x00\x03"
-	localIpv6Addr      = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
-	localIpv6PrefixLen = 120
-	remoteIpv6Addr     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
-	ipv6SubnetAddr     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
-	ipv6SubnetMask     = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00"
-	ipv6Gateway        = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"
-	nicID              = 1
+	localIPv4Addr  = "\x0a\x00\x00\x01"
+	remoteIPv4Addr = "\x0a\x00\x00\x02"
+	ipv4SubnetAddr = "\x0a\x00\x00\x00"
+	ipv4SubnetMask = "\xff\xff\xff\x00"
+	ipv4Gateway    = "\x0a\x00\x00\x03"
+	localIPv6Addr  = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+	remoteIPv6Addr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+	ipv6SubnetAddr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+	ipv6SubnetMask = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00"
+	ipv6Gateway    = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"
+	nicID          = 1
 )
 
+var localIPv4AddrWithPrefix = tcpip.AddressWithPrefix{
+	Address:   localIPv4Addr,
+	PrefixLen: 24,
+}
+
+var localIPv6AddrWithPrefix = tcpip.AddressWithPrefix{
+	Address:   localIPv6Addr,
+	PrefixLen: 120,
+}
+
 // testObject implements two interfaces: LinkEndpoint and TransportDispatcher.
 // The former is used to pretend that it's a link endpoint so that we can
 // inspect packets written by the network endpoints. The latter is used to
@@ -98,9 +110,10 @@ func (t *testObject) checkValues(protocol tcpip.TransportProtocolNumber, vv buff
 // DeliverTransportPacket is called by network endpoints after parsing incoming
 // packets. This is used by the test object to verify that the results of the
 // parsing are expected.
-func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt *stack.PacketBuffer) {
+func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt *stack.PacketBuffer) stack.TransportPacketDisposition {
 	t.checkValues(protocol, pkt.Data, r.RemoteAddress, r.LocalAddress)
 	t.dataCalls++
+	return stack.TransportPacketHandled
 }
 
 // DeliverTransportControlPacket is called by network endpoints after parsing
@@ -194,8 +207,8 @@ func (*testObject) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.Net
 
 func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol(), tcp.NewProtocol()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol},
 	})
 	s.CreateNIC(nicID, loopback.New())
 	s.AddAddress(nicID, ipv4.ProtocolNumber, local)
@@ -210,8 +223,8 @@ func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
 
 func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol(), tcp.NewProtocol()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv6.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol},
 	})
 	s.CreateNIC(nicID, loopback.New())
 	s.AddAddress(nicID, ipv6.ProtocolNumber, local)
@@ -224,33 +237,294 @@ func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
 	return s.FindRoute(nicID, local, remote, ipv6.ProtocolNumber, false /* multicastLoop */)
 }
 
-func buildDummyStack(t *testing.T) *stack.Stack {
+func buildDummyStackWithLinkEndpoint(t *testing.T) (*stack.Stack, *channel.Endpoint) {
 	t.Helper()
 
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol(), tcp.NewProtocol()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, tcp.NewProtocol},
 	})
 	e := channel.New(0, 1280, "")
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 	}
 
-	if err := s.AddAddress(nicID, header.IPv4ProtocolNumber, localIpv4Addr); err != nil {
-		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, localIpv4Addr, err)
+	v4Addr := tcpip.ProtocolAddress{Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: localIPv4AddrWithPrefix}
+	if err := s.AddProtocolAddress(nicID, v4Addr); err != nil {
+		t.Fatalf("AddProtocolAddress(%d, %#v) = %s", nicID, v4Addr, err)
 	}
 
-	if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, localIpv6Addr); err != nil {
-		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, localIpv6Addr, err)
+	v6Addr := tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: localIPv6AddrWithPrefix}
+	if err := s.AddProtocolAddress(nicID, v6Addr); err != nil {
+		t.Fatalf("AddProtocolAddress(%d, %#v) = %s", nicID, v6Addr, err)
 	}
 
+	return s, e
+}
+
+func buildDummyStack(t *testing.T) *stack.Stack {
+	t.Helper()
+
+	s, _ := buildDummyStackWithLinkEndpoint(t)
 	return s
 }
 
+var _ stack.NetworkInterface = (*testInterface)(nil)
+
+type testInterface struct {
+	testObject
+
+	mu struct {
+		sync.RWMutex
+		disabled bool
+	}
+}
+
+func (*testInterface) ID() tcpip.NICID {
+	return nicID
+}
+
+func (*testInterface) IsLoopback() bool {
+	return false
+}
+
+func (*testInterface) Name() string {
+	return ""
+}
+
+func (t *testInterface) Enabled() bool {
+	t.mu.RLock()
+	defer t.mu.RUnlock()
+	return !t.mu.disabled
+}
+
+func (t *testInterface) setEnabled(v bool) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.mu.disabled = !v
+}
+
+func (*testInterface) WritePacketToRemote(tcpip.LinkAddress, *stack.GSO, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+func TestSourceAddressValidation(t *testing.T) {
+	rxIPv4ICMP := func(e *channel.Endpoint, src tcpip.Address) {
+		totalLen := header.IPv4MinimumSize + header.ICMPv4MinimumSize
+		hdr := buffer.NewPrependable(totalLen)
+		pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize))
+		pkt.SetType(header.ICMPv4Echo)
+		pkt.SetCode(0)
+		pkt.SetChecksum(0)
+		pkt.SetChecksum(^header.Checksum(pkt, 0))
+		ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
+		ip.Encode(&header.IPv4Fields{
+			IHL:         header.IPv4MinimumSize,
+			TotalLength: uint16(totalLen),
+			Protocol:    uint8(icmp.ProtocolNumber4),
+			TTL:         ipv4.DefaultTTL,
+			SrcAddr:     src,
+			DstAddr:     localIPv4Addr,
+		})
+		ip.SetChecksum(^ip.CalculateChecksum())
+
+		e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+			Data: hdr.View().ToVectorisedView(),
+		}))
+	}
+
+	rxIPv6ICMP := func(e *channel.Endpoint, src tcpip.Address) {
+		totalLen := header.IPv6MinimumSize + header.ICMPv6MinimumSize
+		hdr := buffer.NewPrependable(totalLen)
+		pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6MinimumSize))
+		pkt.SetType(header.ICMPv6EchoRequest)
+		pkt.SetCode(0)
+		pkt.SetChecksum(0)
+		pkt.SetChecksum(header.ICMPv6Checksum(pkt, src, localIPv6Addr, buffer.VectorisedView{}))
+		ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+		ip.Encode(&header.IPv6Fields{
+			PayloadLength: header.ICMPv6MinimumSize,
+			NextHeader:    uint8(icmp.ProtocolNumber6),
+			HopLimit:      ipv6.DefaultTTL,
+			SrcAddr:       src,
+			DstAddr:       localIPv6Addr,
+		})
+		e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+			Data: hdr.View().ToVectorisedView(),
+		}))
+	}
+
+	tests := []struct {
+		name       string
+		srcAddress tcpip.Address
+		rxICMP     func(*channel.Endpoint, tcpip.Address)
+		valid      bool
+	}{
+		{
+			name:       "IPv4 valid",
+			srcAddress: "\x01\x02\x03\x04",
+			rxICMP:     rxIPv4ICMP,
+			valid:      true,
+		},
+		{
+			name:       "IPv6 valid",
+			srcAddress: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10",
+			rxICMP:     rxIPv6ICMP,
+			valid:      true,
+		},
+		{
+			name:       "IPv4 unspecified",
+			srcAddress: header.IPv4Any,
+			rxICMP:     rxIPv4ICMP,
+			valid:      true,
+		},
+		{
+			name:       "IPv6 unspecified",
+			srcAddress: header.IPv4Any,
+			rxICMP:     rxIPv6ICMP,
+			valid:      true,
+		},
+		{
+			name:       "IPv4 multicast",
+			srcAddress: "\xe0\x00\x00\x01",
+			rxICMP:     rxIPv4ICMP,
+			valid:      false,
+		},
+		{
+			name:       "IPv6 multicast",
+			srcAddress: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01",
+			rxICMP:     rxIPv6ICMP,
+			valid:      false,
+		},
+		{
+			name:       "IPv4 broadcast",
+			srcAddress: header.IPv4Broadcast,
+			rxICMP:     rxIPv4ICMP,
+			valid:      false,
+		},
+		{
+			name: "IPv4 subnet broadcast",
+			srcAddress: func() tcpip.Address {
+				subnet := localIPv4AddrWithPrefix.Subnet()
+				return subnet.Broadcast()
+			}(),
+			rxICMP: rxIPv4ICMP,
+			valid:  false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s, e := buildDummyStackWithLinkEndpoint(t)
+			test.rxICMP(e, test.srcAddress)
+
+			var wantValid uint64
+			if test.valid {
+				wantValid = 1
+			}
+
+			if got, want := s.Stats().IP.InvalidSourceAddressesReceived.Value(), 1-wantValid; got != want {
+				t.Errorf("got s.Stats().IP.InvalidSourceAddressesReceived.Value() = %d, want = %d", got, want)
+			}
+			if got := s.Stats().IP.PacketsDelivered.Value(); got != wantValid {
+				t.Errorf("got s.Stats().IP.PacketsDelivered.Value() = %d, want = %d", got, wantValid)
+			}
+		})
+	}
+}
+
+func TestEnableWhenNICDisabled(t *testing.T) {
+	tests := []struct {
+		name            string
+		protocolFactory stack.NetworkProtocolFactory
+		protoNum        tcpip.NetworkProtocolNumber
+	}{
+		{
+			name:            "IPv4",
+			protocolFactory: ipv4.NewProtocol,
+			protoNum:        ipv4.ProtocolNumber,
+		},
+		{
+			name:            "IPv6",
+			protocolFactory: ipv6.NewProtocol,
+			protoNum:        ipv6.ProtocolNumber,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			var nic testInterface
+			nic.setEnabled(false)
+
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocolFactory{test.protocolFactory},
+			})
+			p := s.NetworkProtocolInstance(test.protoNum)
+
+			// We pass nil for all parameters except the NetworkInterface and Stack
+			// since Enable only depends on these.
+			ep := p.NewEndpoint(&nic, nil, nil, nil)
+
+			// The endpoint should initially be disabled, regardless the NIC's enabled
+			// status.
+			if ep.Enabled() {
+				t.Fatal("got ep.Enabled() = true, want = false")
+			}
+			nic.setEnabled(true)
+			if ep.Enabled() {
+				t.Fatal("got ep.Enabled() = true, want = false")
+			}
+
+			// Attempting to enable the endpoint while the NIC is disabled should
+			// fail.
+			nic.setEnabled(false)
+			if err := ep.Enable(); err != tcpip.ErrNotPermitted {
+				t.Fatalf("got ep.Enable() = %s, want = %s", err, tcpip.ErrNotPermitted)
+			}
+			// ep should consider the NIC's enabled status when determining its own
+			// enabled status so we "enable" the NIC to read just the endpoint's
+			// enabled status.
+			nic.setEnabled(true)
+			if ep.Enabled() {
+				t.Fatal("got ep.Enabled() = true, want = false")
+			}
+
+			// Enabling the interface after the NIC has been enabled should succeed.
+			if err := ep.Enable(); err != nil {
+				t.Fatalf("ep.Enable(): %s", err)
+			}
+			if !ep.Enabled() {
+				t.Fatal("got ep.Enabled() = false, want = true")
+			}
+
+			// ep should consider the NIC's enabled status when determining its own
+			// enabled status.
+			nic.setEnabled(false)
+			if ep.Enabled() {
+				t.Fatal("got ep.Enabled() = true, want = false")
+			}
+
+			// Disabling the endpoint when the NIC is enabled should make the endpoint
+			// disabled.
+			nic.setEnabled(true)
+			ep.Disable()
+			if ep.Enabled() {
+				t.Fatal("got ep.Enabled() = true, want = false")
+			}
+		})
+	}
+}
+
 func TestIPv4Send(t *testing.T) {
-	o := testObject{t: t, v4: true}
-	proto := ipv4.NewProtocol()
-	ep := proto.NewEndpoint(nicID, nil, nil, &o, buildDummyStack(t))
+	s := buildDummyStack(t)
+	proto := s.NetworkProtocolInstance(ipv4.ProtocolNumber)
+	nic := testInterface{
+		testObject: testObject{
+			t:  t,
+			v4: true,
+		},
+	}
+	ep := proto.NewEndpoint(&nic, nil, nil, nil)
 	defer ep.Close()
 
 	// Allocate and initialize the payload view.
@@ -266,12 +540,12 @@ func TestIPv4Send(t *testing.T) {
 	})
 
 	// Issue the write.
-	o.protocol = 123
-	o.srcAddr = localIpv4Addr
-	o.dstAddr = remoteIpv4Addr
-	o.contents = payload
+	nic.testObject.protocol = 123
+	nic.testObject.srcAddr = localIPv4Addr
+	nic.testObject.dstAddr = remoteIPv4Addr
+	nic.testObject.contents = payload
 
-	r, err := buildIPv4Route(localIpv4Addr, remoteIpv4Addr)
+	r, err := buildIPv4Route(localIPv4Addr, remoteIPv4Addr)
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
@@ -285,11 +559,21 @@ func TestIPv4Send(t *testing.T) {
 }
 
 func TestIPv4Receive(t *testing.T) {
-	o := testObject{t: t, v4: true}
-	proto := ipv4.NewProtocol()
-	ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
+	s := buildDummyStack(t)
+	proto := s.NetworkProtocolInstance(ipv4.ProtocolNumber)
+	nic := testInterface{
+		testObject: testObject{
+			t:  t,
+			v4: true,
+		},
+	}
+	ep := proto.NewEndpoint(&nic, nil, nil, &nic.testObject)
 	defer ep.Close()
 
+	if err := ep.Enable(); err != nil {
+		t.Fatalf("ep.Enable(): %s", err)
+	}
+
 	totalLen := header.IPv4MinimumSize + 30
 	view := buffer.NewView(totalLen)
 	ip := header.IPv4(view)
@@ -298,9 +582,10 @@ func TestIPv4Receive(t *testing.T) {
 		TotalLength: uint16(totalLen),
 		TTL:         20,
 		Protocol:    10,
-		SrcAddr:     remoteIpv4Addr,
-		DstAddr:     localIpv4Addr,
+		SrcAddr:     remoteIPv4Addr,
+		DstAddr:     localIPv4Addr,
 	})
+	ip.SetChecksum(^ip.CalculateChecksum())
 
 	// Make payload be non-zero.
 	for i := header.IPv4MinimumSize; i < totalLen; i++ {
@@ -308,12 +593,12 @@ func TestIPv4Receive(t *testing.T) {
 	}
 
 	// Give packet to ipv4 endpoint, dispatcher will validate that it's ok.
-	o.protocol = 10
-	o.srcAddr = remoteIpv4Addr
-	o.dstAddr = localIpv4Addr
-	o.contents = view[header.IPv4MinimumSize:totalLen]
+	nic.testObject.protocol = 10
+	nic.testObject.srcAddr = remoteIPv4Addr
+	nic.testObject.dstAddr = localIPv4Addr
+	nic.testObject.contents = view[header.IPv4MinimumSize:totalLen]
 
-	r, err := buildIPv4Route(localIpv4Addr, remoteIpv4Addr)
+	r, err := buildIPv4Route(localIPv4Addr, remoteIPv4Addr)
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
@@ -324,8 +609,8 @@ func TestIPv4Receive(t *testing.T) {
 		t.Fatalf("failed to parse packet: %x", pkt.Data.ToView())
 	}
 	ep.HandlePacket(&r, pkt)
-	if o.dataCalls != 1 {
-		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
+	if nic.testObject.dataCalls != 1 {
+		t.Fatalf("Bad number of data calls: got %x, want 1", nic.testObject.dataCalls)
 	}
 }
 
@@ -349,17 +634,26 @@ func TestIPv4ReceiveControl(t *testing.T) {
 		{"Non-zero fragment offset", 0, 100, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 0},
 		{"Zero-length packet", 0, 0, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 2*header.IPv4MinimumSize + header.ICMPv4MinimumSize + 8},
 	}
-	r, err := buildIPv4Route(localIpv4Addr, "\x0a\x00\x00\xbb")
+	r, err := buildIPv4Route(localIPv4Addr, "\x0a\x00\x00\xbb")
 	if err != nil {
 		t.Fatal(err)
 	}
 	for _, c := range cases {
 		t.Run(c.name, func(t *testing.T) {
-			o := testObject{t: t}
-			proto := ipv4.NewProtocol()
-			ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
+			s := buildDummyStack(t)
+			proto := s.NetworkProtocolInstance(ipv4.ProtocolNumber)
+			nic := testInterface{
+				testObject: testObject{
+					t: t,
+				},
+			}
+			ep := proto.NewEndpoint(&nic, nil, nil, &nic.testObject)
 			defer ep.Close()
 
+			if err := ep.Enable(); err != nil {
+				t.Fatalf("ep.Enable(): %s", err)
+			}
+
 			const dataOffset = header.IPv4MinimumSize*2 + header.ICMPv4MinimumSize
 			view := buffer.NewView(dataOffset + 8)
 
@@ -371,8 +665,9 @@ func TestIPv4ReceiveControl(t *testing.T) {
 				TTL:         20,
 				Protocol:    uint8(header.ICMPv4ProtocolNumber),
 				SrcAddr:     "\x0a\x00\x00\xbb",
-				DstAddr:     localIpv4Addr,
+				DstAddr:     localIPv4Addr,
 			})
+			ip.SetChecksum(^ip.CalculateChecksum())
 
 			// Create the ICMP header.
 			icmp := header.ICMPv4(view[header.IPv4MinimumSize:])
@@ -389,9 +684,10 @@ func TestIPv4ReceiveControl(t *testing.T) {
 				TTL:            20,
 				Protocol:       10,
 				FragmentOffset: c.fragmentOffset,
-				SrcAddr:        localIpv4Addr,
-				DstAddr:        remoteIpv4Addr,
+				SrcAddr:        localIPv4Addr,
+				DstAddr:        remoteIPv4Addr,
 			})
+			ip.SetChecksum(^ip.CalculateChecksum())
 
 			// Make payload be non-zero.
 			for i := dataOffset; i < len(view); i++ {
@@ -400,27 +696,37 @@ func TestIPv4ReceiveControl(t *testing.T) {
 
 			// Give packet to IPv4 endpoint, dispatcher will validate that
 			// it's ok.
-			o.protocol = 10
-			o.srcAddr = remoteIpv4Addr
-			o.dstAddr = localIpv4Addr
-			o.contents = view[dataOffset:]
-			o.typ = c.expectedTyp
-			o.extra = c.expectedExtra
+			nic.testObject.protocol = 10
+			nic.testObject.srcAddr = remoteIPv4Addr
+			nic.testObject.dstAddr = localIPv4Addr
+			nic.testObject.contents = view[dataOffset:]
+			nic.testObject.typ = c.expectedTyp
+			nic.testObject.extra = c.expectedExtra
 
 			ep.HandlePacket(&r, truncatedPacket(view, c.trunc, header.IPv4MinimumSize))
-			if want := c.expectedCount; o.controlCalls != want {
-				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want)
+			if want := c.expectedCount; nic.testObject.controlCalls != want {
+				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, nic.testObject.controlCalls, want)
 			}
 		})
 	}
 }
 
 func TestIPv4FragmentationReceive(t *testing.T) {
-	o := testObject{t: t, v4: true}
-	proto := ipv4.NewProtocol()
-	ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
+	s := buildDummyStack(t)
+	proto := s.NetworkProtocolInstance(ipv4.ProtocolNumber)
+	nic := testInterface{
+		testObject: testObject{
+			t:  t,
+			v4: true,
+		},
+	}
+	ep := proto.NewEndpoint(&nic, nil, nil, &nic.testObject)
 	defer ep.Close()
 
+	if err := ep.Enable(); err != nil {
+		t.Fatalf("ep.Enable(): %s", err)
+	}
+
 	totalLen := header.IPv4MinimumSize + 24
 
 	frag1 := buffer.NewView(totalLen)
@@ -432,9 +738,11 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 		Protocol:       10,
 		FragmentOffset: 0,
 		Flags:          header.IPv4FlagMoreFragments,
-		SrcAddr:        remoteIpv4Addr,
-		DstAddr:        localIpv4Addr,
+		SrcAddr:        remoteIPv4Addr,
+		DstAddr:        localIPv4Addr,
 	})
+	ip1.SetChecksum(^ip1.CalculateChecksum())
+
 	// Make payload be non-zero.
 	for i := header.IPv4MinimumSize; i < totalLen; i++ {
 		frag1[i] = uint8(i)
@@ -448,21 +756,23 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 		TTL:            20,
 		Protocol:       10,
 		FragmentOffset: 24,
-		SrcAddr:        remoteIpv4Addr,
-		DstAddr:        localIpv4Addr,
+		SrcAddr:        remoteIPv4Addr,
+		DstAddr:        localIPv4Addr,
 	})
+	ip2.SetChecksum(^ip2.CalculateChecksum())
+
 	// Make payload be non-zero.
 	for i := header.IPv4MinimumSize; i < totalLen; i++ {
 		frag2[i] = uint8(i)
 	}
 
 	// Give packet to ipv4 endpoint, dispatcher will validate that it's ok.
-	o.protocol = 10
-	o.srcAddr = remoteIpv4Addr
-	o.dstAddr = localIpv4Addr
-	o.contents = append(frag1[header.IPv4MinimumSize:totalLen], frag2[header.IPv4MinimumSize:totalLen]...)
+	nic.testObject.protocol = 10
+	nic.testObject.srcAddr = remoteIPv4Addr
+	nic.testObject.dstAddr = localIPv4Addr
+	nic.testObject.contents = append(frag1[header.IPv4MinimumSize:totalLen], frag2[header.IPv4MinimumSize:totalLen]...)
 
-	r, err := buildIPv4Route(localIpv4Addr, remoteIpv4Addr)
+	r, err := buildIPv4Route(localIPv4Addr, remoteIPv4Addr)
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
@@ -475,8 +785,8 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 		t.Fatalf("failed to parse packet: %x", pkt.Data.ToView())
 	}
 	ep.HandlePacket(&r, pkt)
-	if o.dataCalls != 0 {
-		t.Fatalf("Bad number of data calls: got %x, want 0", o.dataCalls)
+	if nic.testObject.dataCalls != 0 {
+		t.Fatalf("Bad number of data calls: got %x, want 0", nic.testObject.dataCalls)
 	}
 
 	// Send second segment.
@@ -487,17 +797,26 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 		t.Fatalf("failed to parse packet: %x", pkt.Data.ToView())
 	}
 	ep.HandlePacket(&r, pkt)
-	if o.dataCalls != 1 {
-		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
+	if nic.testObject.dataCalls != 1 {
+		t.Fatalf("Bad number of data calls: got %x, want 1", nic.testObject.dataCalls)
 	}
 }
 
 func TestIPv6Send(t *testing.T) {
-	o := testObject{t: t}
-	proto := ipv6.NewProtocol()
-	ep := proto.NewEndpoint(nicID, nil, &o, channel.New(0, 1280, ""), buildDummyStack(t))
+	s := buildDummyStack(t)
+	proto := s.NetworkProtocolInstance(ipv6.ProtocolNumber)
+	nic := testInterface{
+		testObject: testObject{
+			t: t,
+		},
+	}
+	ep := proto.NewEndpoint(&nic, nil, nil, nil)
 	defer ep.Close()
 
+	if err := ep.Enable(); err != nil {
+		t.Fatalf("ep.Enable(): %s", err)
+	}
+
 	// Allocate and initialize the payload view.
 	payload := buffer.NewView(100)
 	for i := 0; i < len(payload); i++ {
@@ -511,12 +830,12 @@ func TestIPv6Send(t *testing.T) {
 	})
 
 	// Issue the write.
-	o.protocol = 123
-	o.srcAddr = localIpv6Addr
-	o.dstAddr = remoteIpv6Addr
-	o.contents = payload
+	nic.testObject.protocol = 123
+	nic.testObject.srcAddr = localIPv6Addr
+	nic.testObject.dstAddr = remoteIPv6Addr
+	nic.testObject.contents = payload
 
-	r, err := buildIPv6Route(localIpv6Addr, remoteIpv6Addr)
+	r, err := buildIPv6Route(localIPv6Addr, remoteIPv6Addr)
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
@@ -530,11 +849,20 @@ func TestIPv6Send(t *testing.T) {
 }
 
 func TestIPv6Receive(t *testing.T) {
-	o := testObject{t: t}
-	proto := ipv6.NewProtocol()
-	ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
+	s := buildDummyStack(t)
+	proto := s.NetworkProtocolInstance(ipv6.ProtocolNumber)
+	nic := testInterface{
+		testObject: testObject{
+			t: t,
+		},
+	}
+	ep := proto.NewEndpoint(&nic, nil, nil, &nic.testObject)
 	defer ep.Close()
 
+	if err := ep.Enable(); err != nil {
+		t.Fatalf("ep.Enable(): %s", err)
+	}
+
 	totalLen := header.IPv6MinimumSize + 30
 	view := buffer.NewView(totalLen)
 	ip := header.IPv6(view)
@@ -542,8 +870,8 @@ func TestIPv6Receive(t *testing.T) {
 		PayloadLength: uint16(totalLen - header.IPv6MinimumSize),
 		NextHeader:    10,
 		HopLimit:      20,
-		SrcAddr:       remoteIpv6Addr,
-		DstAddr:       localIpv6Addr,
+		SrcAddr:       remoteIPv6Addr,
+		DstAddr:       localIPv6Addr,
 	})
 
 	// Make payload be non-zero.
@@ -552,12 +880,12 @@ func TestIPv6Receive(t *testing.T) {
 	}
 
 	// Give packet to ipv6 endpoint, dispatcher will validate that it's ok.
-	o.protocol = 10
-	o.srcAddr = remoteIpv6Addr
-	o.dstAddr = localIpv6Addr
-	o.contents = view[header.IPv6MinimumSize:totalLen]
+	nic.testObject.protocol = 10
+	nic.testObject.srcAddr = remoteIPv6Addr
+	nic.testObject.dstAddr = localIPv6Addr
+	nic.testObject.contents = view[header.IPv6MinimumSize:totalLen]
 
-	r, err := buildIPv6Route(localIpv6Addr, remoteIpv6Addr)
+	r, err := buildIPv6Route(localIPv6Addr, remoteIPv6Addr)
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
@@ -569,8 +897,8 @@ func TestIPv6Receive(t *testing.T) {
 		t.Fatalf("failed to parse packet: %x", pkt.Data.ToView())
 	}
 	ep.HandlePacket(&r, pkt)
-	if o.dataCalls != 1 {
-		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
+	if nic.testObject.dataCalls != 1 {
+		t.Fatalf("Bad number of data calls: got %x, want 1", nic.testObject.dataCalls)
 	}
 }
 
@@ -601,7 +929,7 @@ func TestIPv6ReceiveControl(t *testing.T) {
 		{"Zero-length packet", 0, nil, header.ICMPv6DstUnreachable, header.ICMPv6PortUnreachable, stack.ControlPortUnreachable, 0, 2*header.IPv6MinimumSize + header.ICMPv6DstUnreachableMinimumSize + 8},
 	}
 	r, err := buildIPv6Route(
-		localIpv6Addr,
+		localIPv6Addr,
 		"\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xaa",
 	)
 	if err != nil {
@@ -609,11 +937,20 @@ func TestIPv6ReceiveControl(t *testing.T) {
 	}
 	for _, c := range cases {
 		t.Run(c.name, func(t *testing.T) {
-			o := testObject{t: t}
-			proto := ipv6.NewProtocol()
-			ep := proto.NewEndpoint(nicID, nil, &o, nil, buildDummyStack(t))
+			s := buildDummyStack(t)
+			proto := s.NetworkProtocolInstance(ipv6.ProtocolNumber)
+			nic := testInterface{
+				testObject: testObject{
+					t: t,
+				},
+			}
+			ep := proto.NewEndpoint(&nic, nil, nil, &nic.testObject)
 			defer ep.Close()
 
+			if err := ep.Enable(); err != nil {
+				t.Fatalf("ep.Enable(): %s", err)
+			}
+
 			dataOffset := header.IPv6MinimumSize*2 + header.ICMPv6MinimumSize
 			if c.fragmentOffset != nil {
 				dataOffset += header.IPv6FragmentHeaderSize
@@ -627,7 +964,7 @@ func TestIPv6ReceiveControl(t *testing.T) {
 				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
 				HopLimit:      20,
 				SrcAddr:       outerSrcAddr,
-				DstAddr:       localIpv6Addr,
+				DstAddr:       localIPv6Addr,
 			})
 
 			// Create the ICMP header.
@@ -643,8 +980,8 @@ func TestIPv6ReceiveControl(t *testing.T) {
 				PayloadLength: 100,
 				NextHeader:    10,
 				HopLimit:      20,
-				SrcAddr:       localIpv6Addr,
-				DstAddr:       remoteIpv6Addr,
+				SrcAddr:       localIPv6Addr,
+				DstAddr:       remoteIPv6Addr,
 			})
 
 			// Build the fragmentation header if needed.
@@ -666,19 +1003,19 @@ func TestIPv6ReceiveControl(t *testing.T) {
 
 			// Give packet to IPv6 endpoint, dispatcher will validate that
 			// it's ok.
-			o.protocol = 10
-			o.srcAddr = remoteIpv6Addr
-			o.dstAddr = localIpv6Addr
-			o.contents = view[dataOffset:]
-			o.typ = c.expectedTyp
-			o.extra = c.expectedExtra
+			nic.testObject.protocol = 10
+			nic.testObject.srcAddr = remoteIPv6Addr
+			nic.testObject.dstAddr = localIPv6Addr
+			nic.testObject.contents = view[dataOffset:]
+			nic.testObject.typ = c.expectedTyp
+			nic.testObject.extra = c.expectedExtra
 
 			// Set ICMPv6 checksum.
-			icmp.SetChecksum(header.ICMPv6Checksum(icmp, outerSrcAddr, localIpv6Addr, buffer.VectorisedView{}))
+			icmp.SetChecksum(header.ICMPv6Checksum(icmp, outerSrcAddr, localIPv6Addr, buffer.VectorisedView{}))
 
 			ep.HandlePacket(&r, truncatedPacket(view, c.trunc, header.IPv6MinimumSize))
-			if want := c.expectedCount; o.controlCalls != want {
-				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want)
+			if want := c.expectedCount; nic.testObject.controlCalls != want {
+				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, nic.testObject.controlCalls, want)
 			}
 		})
 	}
@@ -696,3 +1033,406 @@ func truncatedPacket(view buffer.View, trunc, netHdrLen int) *stack.PacketBuffer
 	_, _ = pkt.NetworkHeader().Consume(netHdrLen)
 	return pkt
 }
+
+func TestWriteHeaderIncludedPacket(t *testing.T) {
+	const (
+		nicID          = 1
+		transportProto = 5
+
+		dataLen    = 4
+		optionsLen = 4
+	)
+
+	dataBuf := [dataLen]byte{1, 2, 3, 4}
+	data := dataBuf[:]
+
+	ipv4OptionsBuf := [optionsLen]byte{0, 1, 0, 1}
+	ipv4Options := ipv4OptionsBuf[:]
+
+	ipv6FragmentExtHdrBuf := [header.IPv6FragmentExtHdrLength]byte{transportProto, 0, 62, 4, 1, 2, 3, 4}
+	ipv6FragmentExtHdr := ipv6FragmentExtHdrBuf[:]
+
+	var ipv6PayloadWithExtHdrBuf [dataLen + header.IPv6FragmentExtHdrLength]byte
+	ipv6PayloadWithExtHdr := ipv6PayloadWithExtHdrBuf[:]
+	if n := copy(ipv6PayloadWithExtHdr, ipv6FragmentExtHdr); n != len(ipv6FragmentExtHdr) {
+		t.Fatalf("copied %d bytes, expected %d bytes", n, len(ipv6FragmentExtHdr))
+	}
+	if n := copy(ipv6PayloadWithExtHdr[header.IPv6FragmentExtHdrLength:], data); n != len(data) {
+		t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+	}
+
+	tests := []struct {
+		name         string
+		protoFactory stack.NetworkProtocolFactory
+		protoNum     tcpip.NetworkProtocolNumber
+		nicAddr      tcpip.Address
+		remoteAddr   tcpip.Address
+		pktGen       func(*testing.T, tcpip.Address) buffer.View
+		checker      func(*testing.T, *stack.PacketBuffer, tcpip.Address)
+		expectedErr  *tcpip.Error
+	}{
+		{
+			name:         "IPv4",
+			protoFactory: ipv4.NewProtocol,
+			protoNum:     ipv4.ProtocolNumber,
+			nicAddr:      localIPv4Addr,
+			remoteAddr:   remoteIPv4Addr,
+			pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+				totalLen := header.IPv4MinimumSize + len(data)
+				hdr := buffer.NewPrependable(totalLen)
+				if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+					t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+				}
+				ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
+				ip.Encode(&header.IPv4Fields{
+					IHL:      header.IPv4MinimumSize,
+					Protocol: transportProto,
+					TTL:      ipv4.DefaultTTL,
+					SrcAddr:  src,
+					DstAddr:  header.IPv4Any,
+				})
+				return hdr.View()
+			},
+			checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+				if src == header.IPv4Any {
+					src = localIPv4Addr
+				}
+
+				netHdr := pkt.NetworkHeader()
+
+				if len(netHdr.View()) != header.IPv4MinimumSize {
+					t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv4MinimumSize)
+				}
+
+				checker.IPv4(t, stack.PayloadSince(netHdr),
+					checker.SrcAddr(src),
+					checker.DstAddr(remoteIPv4Addr),
+					checker.IPv4HeaderLength(header.IPv4MinimumSize),
+					checker.IPFullLength(uint16(header.IPv4MinimumSize+len(data))),
+					checker.IPPayload(data),
+				)
+			},
+		},
+		{
+			name:         "IPv4 with IHL too small",
+			protoFactory: ipv4.NewProtocol,
+			protoNum:     ipv4.ProtocolNumber,
+			nicAddr:      localIPv4Addr,
+			remoteAddr:   remoteIPv4Addr,
+			pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+				totalLen := header.IPv4MinimumSize + len(data)
+				hdr := buffer.NewPrependable(totalLen)
+				if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+					t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+				}
+				ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
+				ip.Encode(&header.IPv4Fields{
+					IHL:      header.IPv4MinimumSize - 1,
+					Protocol: transportProto,
+					TTL:      ipv4.DefaultTTL,
+					SrcAddr:  src,
+					DstAddr:  header.IPv4Any,
+				})
+				return hdr.View()
+			},
+			expectedErr: tcpip.ErrMalformedHeader,
+		},
+		{
+			name:         "IPv4 too small",
+			protoFactory: ipv4.NewProtocol,
+			protoNum:     ipv4.ProtocolNumber,
+			nicAddr:      localIPv4Addr,
+			remoteAddr:   remoteIPv4Addr,
+			pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+				ip := header.IPv4(make([]byte, header.IPv4MinimumSize))
+				ip.Encode(&header.IPv4Fields{
+					IHL:      header.IPv4MinimumSize,
+					Protocol: transportProto,
+					TTL:      ipv4.DefaultTTL,
+					SrcAddr:  src,
+					DstAddr:  header.IPv4Any,
+				})
+				return buffer.View(ip[:len(ip)-1])
+			},
+			expectedErr: tcpip.ErrMalformedHeader,
+		},
+		{
+			name:         "IPv4 minimum size",
+			protoFactory: ipv4.NewProtocol,
+			protoNum:     ipv4.ProtocolNumber,
+			nicAddr:      localIPv4Addr,
+			remoteAddr:   remoteIPv4Addr,
+			pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+				ip := header.IPv4(make([]byte, header.IPv4MinimumSize))
+				ip.Encode(&header.IPv4Fields{
+					IHL:      header.IPv4MinimumSize,
+					Protocol: transportProto,
+					TTL:      ipv4.DefaultTTL,
+					SrcAddr:  src,
+					DstAddr:  header.IPv4Any,
+				})
+				return buffer.View(ip)
+			},
+			checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+				if src == header.IPv4Any {
+					src = localIPv4Addr
+				}
+
+				netHdr := pkt.NetworkHeader()
+
+				if len(netHdr.View()) != header.IPv4MinimumSize {
+					t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv4MinimumSize)
+				}
+
+				checker.IPv4(t, stack.PayloadSince(netHdr),
+					checker.SrcAddr(src),
+					checker.DstAddr(remoteIPv4Addr),
+					checker.IPv4HeaderLength(header.IPv4MinimumSize),
+					checker.IPFullLength(header.IPv4MinimumSize),
+					checker.IPPayload(nil),
+				)
+			},
+		},
+		{
+			name:         "IPv4 with options",
+			protoFactory: ipv4.NewProtocol,
+			protoNum:     ipv4.ProtocolNumber,
+			nicAddr:      localIPv4Addr,
+			remoteAddr:   remoteIPv4Addr,
+			pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+				ipHdrLen := header.IPv4MinimumSize + len(ipv4Options)
+				totalLen := ipHdrLen + len(data)
+				hdr := buffer.NewPrependable(totalLen)
+				if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+					t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+				}
+				ip := header.IPv4(hdr.Prepend(ipHdrLen))
+				ip.Encode(&header.IPv4Fields{
+					IHL:      uint8(ipHdrLen),
+					Protocol: transportProto,
+					TTL:      ipv4.DefaultTTL,
+					SrcAddr:  src,
+					DstAddr:  header.IPv4Any,
+				})
+				if n := copy(ip.Options(), ipv4Options); n != len(ipv4Options) {
+					t.Fatalf("copied %d bytes, expected %d bytes", n, len(ipv4Options))
+				}
+				return hdr.View()
+			},
+			checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+				if src == header.IPv4Any {
+					src = localIPv4Addr
+				}
+
+				netHdr := pkt.NetworkHeader()
+
+				hdrLen := header.IPv4MinimumSize + len(ipv4Options)
+				if len(netHdr.View()) != hdrLen {
+					t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), hdrLen)
+				}
+
+				checker.IPv4(t, stack.PayloadSince(netHdr),
+					checker.SrcAddr(src),
+					checker.DstAddr(remoteIPv4Addr),
+					checker.IPv4HeaderLength(hdrLen),
+					checker.IPFullLength(uint16(hdrLen+len(data))),
+					checker.IPv4Options(ipv4Options),
+					checker.IPPayload(data),
+				)
+			},
+		},
+		{
+			name:         "IPv6",
+			protoFactory: ipv6.NewProtocol,
+			protoNum:     ipv6.ProtocolNumber,
+			nicAddr:      localIPv6Addr,
+			remoteAddr:   remoteIPv6Addr,
+			pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+				totalLen := header.IPv6MinimumSize + len(data)
+				hdr := buffer.NewPrependable(totalLen)
+				if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+					t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+				}
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					NextHeader: transportProto,
+					HopLimit:   ipv6.DefaultTTL,
+					SrcAddr:    src,
+					DstAddr:    header.IPv4Any,
+				})
+				return hdr.View()
+			},
+			checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+				if src == header.IPv6Any {
+					src = localIPv6Addr
+				}
+
+				netHdr := pkt.NetworkHeader()
+
+				if len(netHdr.View()) != header.IPv6MinimumSize {
+					t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv6MinimumSize)
+				}
+
+				checker.IPv6(t, stack.PayloadSince(netHdr),
+					checker.SrcAddr(src),
+					checker.DstAddr(remoteIPv6Addr),
+					checker.IPFullLength(uint16(header.IPv6MinimumSize+len(data))),
+					checker.IPPayload(data),
+				)
+			},
+		},
+		{
+			name:         "IPv6 with extension header",
+			protoFactory: ipv6.NewProtocol,
+			protoNum:     ipv6.ProtocolNumber,
+			nicAddr:      localIPv6Addr,
+			remoteAddr:   remoteIPv6Addr,
+			pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+				totalLen := header.IPv6MinimumSize + len(ipv6FragmentExtHdr) + len(data)
+				hdr := buffer.NewPrependable(totalLen)
+				if n := copy(hdr.Prepend(len(data)), data); n != len(data) {
+					t.Fatalf("copied %d bytes, expected %d bytes", n, len(data))
+				}
+				if n := copy(hdr.Prepend(len(ipv6FragmentExtHdr)), ipv6FragmentExtHdr); n != len(ipv6FragmentExtHdr) {
+					t.Fatalf("copied %d bytes, expected %d bytes", n, len(ipv6FragmentExtHdr))
+				}
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					NextHeader: uint8(header.IPv6FragmentExtHdrIdentifier),
+					HopLimit:   ipv6.DefaultTTL,
+					SrcAddr:    src,
+					DstAddr:    header.IPv4Any,
+				})
+				return hdr.View()
+			},
+			checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+				if src == header.IPv6Any {
+					src = localIPv6Addr
+				}
+
+				netHdr := pkt.NetworkHeader()
+
+				if want := header.IPv6MinimumSize + len(ipv6FragmentExtHdr); len(netHdr.View()) != want {
+					t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), want)
+				}
+
+				checker.IPv6(t, stack.PayloadSince(netHdr),
+					checker.SrcAddr(src),
+					checker.DstAddr(remoteIPv6Addr),
+					checker.IPFullLength(uint16(header.IPv6MinimumSize+len(ipv6PayloadWithExtHdr))),
+					checker.IPPayload(ipv6PayloadWithExtHdr),
+				)
+			},
+		},
+		{
+			name:         "IPv6 minimum size",
+			protoFactory: ipv6.NewProtocol,
+			protoNum:     ipv6.ProtocolNumber,
+			nicAddr:      localIPv6Addr,
+			remoteAddr:   remoteIPv6Addr,
+			pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+				ip := header.IPv6(make([]byte, header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					NextHeader: transportProto,
+					HopLimit:   ipv6.DefaultTTL,
+					SrcAddr:    src,
+					DstAddr:    header.IPv4Any,
+				})
+				return buffer.View(ip)
+			},
+			checker: func(t *testing.T, pkt *stack.PacketBuffer, src tcpip.Address) {
+				if src == header.IPv6Any {
+					src = localIPv6Addr
+				}
+
+				netHdr := pkt.NetworkHeader()
+
+				if len(netHdr.View()) != header.IPv6MinimumSize {
+					t.Errorf("got len(netHdr.View()) = %d, want = %d", len(netHdr.View()), header.IPv6MinimumSize)
+				}
+
+				checker.IPv6(t, stack.PayloadSince(netHdr),
+					checker.SrcAddr(src),
+					checker.DstAddr(remoteIPv6Addr),
+					checker.IPFullLength(header.IPv6MinimumSize),
+					checker.IPPayload(nil),
+				)
+			},
+		},
+		{
+			name:         "IPv6 too small",
+			protoFactory: ipv6.NewProtocol,
+			protoNum:     ipv6.ProtocolNumber,
+			nicAddr:      localIPv6Addr,
+			remoteAddr:   remoteIPv6Addr,
+			pktGen: func(t *testing.T, src tcpip.Address) buffer.View {
+				ip := header.IPv6(make([]byte, header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					NextHeader: transportProto,
+					HopLimit:   ipv6.DefaultTTL,
+					SrcAddr:    src,
+					DstAddr:    header.IPv4Any,
+				})
+				return buffer.View(ip[:len(ip)-1])
+			},
+			expectedErr: tcpip.ErrMalformedHeader,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			subTests := []struct {
+				name    string
+				srcAddr tcpip.Address
+			}{
+				{
+					name:    "unspecified source",
+					srcAddr: tcpip.Address(strings.Repeat("\x00", len(test.nicAddr))),
+				},
+				{
+					name:    "random source",
+					srcAddr: tcpip.Address(strings.Repeat("\xab", len(test.nicAddr))),
+				},
+			}
+
+			for _, subTest := range subTests {
+				t.Run(subTest.name, func(t *testing.T) {
+					s := stack.New(stack.Options{
+						NetworkProtocols: []stack.NetworkProtocolFactory{test.protoFactory},
+					})
+					e := channel.New(1, 1280, "")
+					if err := s.CreateNIC(nicID, e); err != nil {
+						t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err)
+					}
+					if err := s.AddAddress(nicID, test.protoNum, test.nicAddr); err != nil {
+						t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, test.protoNum, test.nicAddr, err)
+					}
+
+					s.SetRouteTable([]tcpip.Route{{Destination: test.remoteAddr.WithPrefix().Subnet(), NIC: nicID}})
+
+					r, err := s.FindRoute(nicID, test.nicAddr, test.remoteAddr, test.protoNum, false /* multicastLoop */)
+					if err != nil {
+						t.Fatalf("s.FindRoute(%d, %s, %s, %d, false): %s", nicID, test.remoteAddr, test.nicAddr, test.protoNum, err)
+					}
+					defer r.Release()
+
+					if err := r.WriteHeaderIncludedPacket(stack.NewPacketBuffer(stack.PacketBufferOptions{
+						Data: test.pktGen(t, subTest.srcAddr).ToVectorisedView(),
+					})); err != test.expectedErr {
+						t.Fatalf("got r.WriteHeaderIncludedPacket(_) = %s, want = %s", err, test.expectedErr)
+					}
+
+					if test.expectedErr != nil {
+						return
+					}
+
+					pkt, ok := e.Read()
+					if !ok {
+						t.Fatal("expected a packet to be written")
+					}
+					test.checker(t, pkt.Pkt, subTest.srcAddr)
+				})
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD
index d142b4ffa..6252614ec 100644
--- a/pkg/tcpip/network/ipv4/BUILD
+++ b/pkg/tcpip/network/ipv4/BUILD
@@ -10,9 +10,11 @@ go_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/header/parse",
         "//pkg/tcpip/network/fragmentation",
         "//pkg/tcpip/network/hash",
         "//pkg/tcpip/stack",
@@ -26,11 +28,16 @@ go_test(
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
+        "//pkg/tcpip/checker",
+        "//pkg/tcpip/faketime",
         "//pkg/tcpip/header",
         "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/sniffer",
+        "//pkg/tcpip/network/arp",
         "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/network/testutil",
         "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/icmp",
         "//pkg/tcpip/transport/tcp",
         "//pkg/tcpip/transport/udp",
         "//pkg/waiter",
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
index b5659a36b..1edb6de6b 100644
--- a/pkg/tcpip/network/ipv4/icmp.go
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -15,6 +15,9 @@
 package ipv4
 
 import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -39,7 +42,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack
 	// Drop packet if it doesn't have the basic IPv4 header or if the
 	// original source address doesn't match an address we own.
 	src := hdr.SourceAddress()
-	if e.stack.CheckLocalAddress(e.NICID(), ProtocolNumber, src) == 0 {
+	if e.protocol.stack.CheckLocalAddress(e.nic.ID(), ProtocolNumber, src) == 0 {
 		return
 	}
 
@@ -76,69 +79,87 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer) {
 		received.Echo.Increment()
 
 		// Only send a reply if the checksum is valid.
-		wantChecksum := h.Checksum()
-		// Reset the checksum field to 0 to can calculate the proper
-		// checksum. We'll have to reset this before we hand the packet
-		// off.
+		headerChecksum := h.Checksum()
 		h.SetChecksum(0)
-		gotChecksum := ^header.ChecksumVV(pkt.Data, 0 /* initial */)
-		if gotChecksum != wantChecksum {
-			// It's possible that a raw socket expects to receive this.
-			h.SetChecksum(wantChecksum)
+		calculatedChecksum := ^header.ChecksumVV(pkt.Data, 0 /* initial */)
+		h.SetChecksum(headerChecksum)
+		if calculatedChecksum != headerChecksum {
+			// It's possible that a raw socket still expects to receive this.
 			e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, pkt)
 			received.Invalid.Increment()
 			return
 		}
 
-		// Make a copy of data before pkt gets sent to raw socket.
-		// DeliverTransportPacket will take ownership of pkt.
-		replyData := pkt.Data.Clone(nil)
-		replyData.TrimFront(header.ICMPv4MinimumSize)
+		// DeliverTransportPacket will take ownership of pkt so don't use it beyond
+		// this point. Make a deep copy of the data before pkt gets sent as we will
+		// be modifying fields.
+		//
+		// TODO(gvisor.dev/issue/4399): The copy may not be needed if there are no
+		// waiting endpoints. Consider moving responsibility for doing the copy to
+		// DeliverTransportPacket so that is is only done when needed.
+		replyData := pkt.Data.ToOwnedView()
+		replyIPHdr := header.IPv4(append(buffer.View(nil), pkt.NetworkHeader().View()...))
 
-		// It's possible that a raw socket expects to receive this.
-		h.SetChecksum(wantChecksum)
 		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, pkt)
 
-		remoteLinkAddr := r.RemoteLinkAddress
-
 		// As per RFC 1122 section 3.2.1.3, when a host sends any datagram, the IP
 		// source address MUST be one of its own IP addresses (but not a broadcast
 		// or multicast address).
 		localAddr := r.LocalAddress
-		if r.IsInboundBroadcast() || header.IsV4MulticastAddress(r.LocalAddress) {
+		if r.IsInboundBroadcast() || header.IsV4MulticastAddress(localAddr) {
 			localAddr = ""
 		}
 
-		r, err := r.Stack().FindRoute(e.NICID(), localAddr, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */)
+		r, err := r.Stack().FindRoute(e.nic.ID(), localAddr, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */)
 		if err != nil {
 			// If we cannot find a route to the destination, silently drop the packet.
 			return
 		}
 		defer r.Release()
 
-		// Use the remote link address from the incoming packet.
-		r.ResolveWith(remoteLinkAddr)
-
-		// Prepare a reply packet.
-		icmpHdr := make(header.ICMPv4, header.ICMPv4MinimumSize)
-		copy(icmpHdr, h)
-		icmpHdr.SetType(header.ICMPv4EchoReply)
-		icmpHdr.SetChecksum(0)
-		icmpHdr.SetChecksum(^header.Checksum(icmpHdr, header.ChecksumVV(replyData, 0)))
-		dataVV := buffer.View(icmpHdr).ToVectorisedView()
-		dataVV.Append(replyData)
+		// TODO(gvisor.dev/issue/3810:) When adding protocol numbers into the
+		// header information, we may have to change this code to handle the
+		// ICMP header no longer being in the data buffer.
+
+		// Because IP and ICMP are so closely intertwined, we need to handcraft our
+		// IP header to be able to follow RFC 792. The wording on page 13 is as
+		// follows:
+		//   IP Fields:
+		//   Addresses
+		//     The address of the source in an echo message will be the
+		//     destination of the echo reply message.  To form an echo reply
+		//     message, the source and destination addresses are simply reversed,
+		//     the type code changed to 0, and the checksum recomputed.
+		//
+		// This was interpreted by early implementors to mean that all options must
+		// be copied from the echo request IP header to the echo reply IP header
+		// and this behaviour is still relied upon by some applications.
+		//
+		// Create a copy of the IP header we received, options and all, and change
+		// The fields we need to alter.
+		//
+		// We need to produce the entire packet in the data segment in order to
+		// use WriteHeaderIncludedPacket().
+		replyIPHdr.SetSourceAddress(r.LocalAddress)
+		replyIPHdr.SetDestinationAddress(r.RemoteAddress)
+		replyIPHdr.SetTTL(r.DefaultTTL())
+
+		replyICMPHdr := header.ICMPv4(replyData)
+		replyICMPHdr.SetType(header.ICMPv4EchoReply)
+		replyICMPHdr.SetChecksum(0)
+		replyICMPHdr.SetChecksum(^header.Checksum(replyData, 0))
+
+		replyVV := buffer.View(replyIPHdr).ToVectorisedView()
+		replyVV.AppendView(replyData)
 		replyPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
 			ReserveHeaderBytes: int(r.MaxHeaderLength()),
-			Data:               dataVV,
+			Data:               replyVV,
 		})
+		replyPkt.TransportProtocolNumber = header.ICMPv4ProtocolNumber
 
-		// Send out the reply packet.
+		// The checksum will be calculated so we don't need to do it here.
 		sent := stats.ICMP.V4PacketsSent
-		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{
-			Protocol: header.ICMPv4ProtocolNumber,
-			TTL:      r.DefaultTTL(),
-			TOS:      stack.DefaultTOS,
-		}, replyPkt); err != nil {
+		if err := r.WriteHeaderIncludedPacket(replyPkt); err != nil {
 			sent.Dropped.Increment()
 			return
 		}
@@ -161,8 +182,11 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer) {
 			e.handleControl(stack.ControlPortUnreachable, 0, pkt)
 
 		case header.ICMPv4FragmentationNeeded:
-			mtu := uint32(h.MTU())
-			e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), pkt)
+			networkMTU, err := calculateNetworkMTU(uint32(h.MTU()), header.IPv4MinimumSize)
+			if err != nil {
+				networkMTU = 0
+			}
+			e.handleControl(stack.ControlPacketTooBig, networkMTU, pkt)
 		}
 
 	case header.ICMPv4SrcQuench:
@@ -193,3 +217,204 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer) {
 		received.Invalid.Increment()
 	}
 }
+
+// ======= ICMP Error packet generation =========
+
+// icmpReason is a marker interface for IPv4 specific ICMP errors.
+type icmpReason interface {
+	isICMPReason()
+}
+
+// icmpReasonPortUnreachable is an error where the transport protocol has no
+// listener and no alternative means to inform the sender.
+type icmpReasonPortUnreachable struct{}
+
+func (*icmpReasonPortUnreachable) isICMPReason() {}
+
+// icmpReasonProtoUnreachable is an error where the transport protocol is
+// not supported.
+type icmpReasonProtoUnreachable struct{}
+
+func (*icmpReasonProtoUnreachable) isICMPReason() {}
+
+// icmpReasonReassemblyTimeout is an error where insufficient fragments are
+// received to complete reassembly of a packet within a configured time after
+// the reception of the first-arriving fragment of that packet.
+type icmpReasonReassemblyTimeout struct{}
+
+func (*icmpReasonReassemblyTimeout) isICMPReason() {}
+
+// returnError takes an error descriptor and generates the appropriate ICMP
+// error packet for IPv4 and sends it back to the remote device that sent
+// the problematic packet. It incorporates as much of that packet as
+// possible as well as any error metadata as is available. returnError
+// expects pkt to hold a valid IPv4 packet as per the wire format.
+func (p *protocol) returnError(r *stack.Route, reason icmpReason, pkt *stack.PacketBuffer) *tcpip.Error {
+	// We check we are responding only when we are allowed to.
+	// See RFC 1812 section 4.3.2.7 (shown below).
+	//
+	// =========
+	// 4.3.2.7 When Not to Send ICMP Errors
+	//
+	//  An ICMP error message MUST NOT be sent as the result of receiving:
+	//
+	//  o An ICMP error message, or
+	//
+	//  o A packet which fails the IP header validation tests described in
+	//    Section [5.2.2] (except where that section specifically permits
+	//    the sending of an ICMP error message), or
+	//
+	//  o A packet destined to an IP broadcast or IP multicast address, or
+	//
+	//  o A packet sent as a Link Layer broadcast or multicast, or
+	//
+	//  o Any fragment of a datagram other then the first fragment (i.e., a
+	// packet for which the fragment offset in the IP header is nonzero).
+	//
+	// TODO(gvisor.dev/issues/4058): Make sure we don't send ICMP errors in
+	// response to a non-initial fragment, but it currently can not happen.
+
+	if r.IsInboundBroadcast() || header.IsV4MulticastAddress(r.LocalAddress) || r.RemoteAddress == header.IPv4Any {
+		return nil
+	}
+
+	// Even if we were able to receive a packet from some remote, we may not have
+	// a route to it - the remote may be blocked via routing rules. We must always
+	// consult our routing table and find a route to the remote before sending any
+	// packet.
+	route, err := p.stack.FindRoute(r.NICID(), r.LocalAddress, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */)
+	if err != nil {
+		return err
+	}
+	defer route.Release()
+	// From this point on, the incoming route should no longer be used; route
+	// must be used to send the ICMP error.
+	r = nil
+
+	sent := p.stack.Stats().ICMP.V4PacketsSent
+	if !p.stack.AllowICMPMessage() {
+		sent.RateLimited.Increment()
+		return nil
+	}
+
+	networkHeader := pkt.NetworkHeader().View()
+	transportHeader := pkt.TransportHeader().View()
+
+	// Don't respond to icmp error packets.
+	if header.IPv4(networkHeader).Protocol() == uint8(header.ICMPv4ProtocolNumber) {
+		// TODO(gvisor.dev/issue/3810):
+		// Unfortunately the current stack pretty much always has ICMPv4 headers
+		// in the Data section of the packet but there is no guarantee that is the
+		// case. If this is the case grab the header to make it like all other
+		// packet types. When this is cleaned up the Consume should be removed.
+		if transportHeader.IsEmpty() {
+			var ok bool
+			transportHeader, ok = pkt.TransportHeader().Consume(header.ICMPv4MinimumSize)
+			if !ok {
+				return nil
+			}
+		} else if transportHeader.Size() < header.ICMPv4MinimumSize {
+			return nil
+		}
+		// We need to decide to explicitly name the packets we can respond to or
+		// the ones we can not respond to. The decision is somewhat arbitrary and
+		// if problems arise this could be reversed. It was judged less of a breach
+		// of protocol to not respond to unknown non-error packets than to respond
+		// to unknown error packets so we take the first approach.
+		switch header.ICMPv4(transportHeader).Type() {
+		case
+			header.ICMPv4EchoReply,
+			header.ICMPv4Echo,
+			header.ICMPv4Timestamp,
+			header.ICMPv4TimestampReply,
+			header.ICMPv4InfoRequest,
+			header.ICMPv4InfoReply:
+		default:
+			// Assume any type we don't know about may be an error type.
+			return nil
+		}
+	}
+
+	// Now work out how much of the triggering packet we should return.
+	// As per RFC 1812 Section 4.3.2.3
+	//
+	//   ICMP datagram SHOULD contain as much of the original
+	//   datagram as possible without the length of the ICMP
+	//   datagram exceeding 576 bytes.
+	//
+	// NOTE: The above RFC referenced is different from the original
+	// recommendation in RFC 1122 and RFC 792 where it mentioned that at
+	// least 8 bytes of the payload must be included. Today linux and other
+	// systems implement the RFC 1812 definition and not the original
+	// requirement. We treat 8 bytes as the minimum but will try send more.
+	mtu := int(route.MTU())
+	if mtu > header.IPv4MinimumProcessableDatagramSize {
+		mtu = header.IPv4MinimumProcessableDatagramSize
+	}
+	headerLen := int(route.MaxHeaderLength()) + header.ICMPv4MinimumSize
+	available := int(mtu) - headerLen
+
+	if available < header.IPv4MinimumSize+header.ICMPv4MinimumErrorPayloadSize {
+		return nil
+	}
+
+	payloadLen := networkHeader.Size() + transportHeader.Size() + pkt.Data.Size()
+	if payloadLen > available {
+		payloadLen = available
+	}
+
+	// The buffers used by pkt may be used elsewhere in the system.
+	// For example, an AF_RAW or AF_PACKET socket may use what the transport
+	// protocol considers an unreachable destination. Thus we deep copy pkt to
+	// prevent multiple ownership and SR errors. The new copy is a vectorized
+	// view with the entire incoming IP packet reassembled and truncated as
+	// required. This is now the payload of the new ICMP packet and no longer
+	// considered a packet in its own right.
+	newHeader := append(buffer.View(nil), networkHeader...)
+	newHeader = append(newHeader, transportHeader...)
+	payload := newHeader.ToVectorisedView()
+	payload.AppendView(pkt.Data.ToView())
+	payload.CapLength(payloadLen)
+
+	icmpPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+		ReserveHeaderBytes: headerLen,
+		Data:               payload,
+	})
+
+	icmpPkt.TransportProtocolNumber = header.ICMPv4ProtocolNumber
+
+	icmpHdr := header.ICMPv4(icmpPkt.TransportHeader().Push(header.ICMPv4MinimumSize))
+	var counter *tcpip.StatCounter
+	switch reason.(type) {
+	case *icmpReasonPortUnreachable:
+		icmpHdr.SetType(header.ICMPv4DstUnreachable)
+		icmpHdr.SetCode(header.ICMPv4PortUnreachable)
+		counter = sent.DstUnreachable
+	case *icmpReasonProtoUnreachable:
+		icmpHdr.SetType(header.ICMPv4DstUnreachable)
+		icmpHdr.SetCode(header.ICMPv4ProtoUnreachable)
+		counter = sent.DstUnreachable
+	case *icmpReasonReassemblyTimeout:
+		icmpHdr.SetType(header.ICMPv4TimeExceeded)
+		icmpHdr.SetCode(header.ICMPv4ReassemblyTimeout)
+		counter = sent.TimeExceeded
+	default:
+		panic(fmt.Sprintf("unsupported ICMP type %T", reason))
+	}
+	icmpHdr.SetChecksum(header.ICMPv4Checksum(icmpHdr, icmpPkt.Data))
+
+	if err := route.WritePacket(
+		nil, /* gso */
+		stack.NetworkHeaderParams{
+			Protocol: header.ICMPv4ProtocolNumber,
+			TTL:      route.DefaultTTL(),
+			TOS:      stack.DefaultTOS,
+		},
+		icmpPkt,
+	); err != nil {
+		sent.Dropped.Increment()
+		return err
+	}
+	counter.Increment()
+	return nil
+}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 79872ec9a..e6b842d61 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -12,26 +12,34 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package ipv4 contains the implementation of the ipv4 network protocol. To use
-// it in the networking stack, this package must be added to the project, and
-// activated on the stack by passing ipv4.NewProtocol() as one of the network
-// protocols when calling stack.New(). Then endpoints can be created by passing
-// ipv4.ProtocolNumber as the network protocol number when calling
-// Stack.NewEndpoint().
+// Package ipv4 contains the implementation of the ipv4 network protocol.
 package ipv4
 
 import (
+	"fmt"
 	"sync/atomic"
+	"time"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/header/parse"
 	"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
 	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
 const (
+	// As per RFC 791 section 3.2:
+	//   The current recommendation for the initial timer setting is 15 seconds.
+	//   This may be changed as experience with this protocol accumulates.
+	//
+	// Considering that it is an old recommendation, we use the same reassembly
+	// timeout that linux defines, which is 30 seconds:
+	// https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ip.h#L138
+	ReassembleTimeout = 30 * time.Second
+
 	// ProtocolNumber is the ipv4 protocol number.
 	ProtocolNumber = header.IPv4ProtocolNumber
 
@@ -50,158 +58,140 @@ const (
 	fragmentblockSize = 8
 )
 
+var ipv4BroadcastAddr = header.IPv4Broadcast.WithPrefix()
+
+var _ stack.GroupAddressableEndpoint = (*endpoint)(nil)
+var _ stack.AddressableEndpoint = (*endpoint)(nil)
+var _ stack.NetworkEndpoint = (*endpoint)(nil)
+
 type endpoint struct {
-	nicID      tcpip.NICID
-	linkEP     stack.LinkEndpoint
+	nic        stack.NetworkInterface
 	dispatcher stack.TransportDispatcher
 	protocol   *protocol
-	stack      *stack.Stack
+
+	// enabled is set to 1 when the enpoint is enabled and 0 when it is
+	// disabled.
+	//
+	// Must be accessed using atomic operations.
+	enabled uint32
+
+	mu struct {
+		sync.RWMutex
+
+		addressableEndpointState stack.AddressableEndpointState
+	}
 }
 
 // NewEndpoint creates a new ipv4 endpoint.
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
-	return &endpoint{
-		nicID:      nicID,
-		linkEP:     linkEP,
+func (p *protocol) NewEndpoint(nic stack.NetworkInterface, _ stack.LinkAddressCache, _ stack.NUDHandler, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint {
+	e := &endpoint{
+		nic:        nic,
 		dispatcher: dispatcher,
 		protocol:   p,
-		stack:      st,
 	}
+	e.mu.addressableEndpointState.Init(e)
+	return e
 }
 
-// DefaultTTL is the default time-to-live value for this endpoint.
-func (e *endpoint) DefaultTTL() uint8 {
-	return e.protocol.DefaultTTL()
-}
+// Enable implements stack.NetworkEndpoint.
+func (e *endpoint) Enable() *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
 
-// MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus
-// the network layer max header length.
-func (e *endpoint) MTU() uint32 {
-	return calculateMTU(e.linkEP.MTU())
-}
+	// If the NIC is not enabled, the endpoint can't do anything meaningful so
+	// don't enable the endpoint.
+	if !e.nic.Enabled() {
+		return tcpip.ErrNotPermitted
+	}
+
+	// If the endpoint is already enabled, there is nothing for it to do.
+	if !e.setEnabled(true) {
+		return nil
+	}
+
+	// Create an endpoint to receive broadcast packets on this interface.
+	ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(ipv4BroadcastAddr, stack.NeverPrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */)
+	if err != nil {
+		return err
+	}
+	// We have no need for the address endpoint.
+	ep.DecRef()
 
-// Capabilities implements stack.NetworkEndpoint.Capabilities.
-func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
-	return e.linkEP.Capabilities()
+	// As per RFC 1122 section 3.3.7, all hosts should join the all-hosts
+	// multicast group. Note, the IANA calls the all-hosts multicast group the
+	// all-systems multicast group.
+	_, err = e.mu.addressableEndpointState.JoinGroup(header.IPv4AllSystems)
+	return err
 }
 
-// NICID returns the ID of the NIC this endpoint belongs to.
-func (e *endpoint) NICID() tcpip.NICID {
-	return e.nicID
+// Enabled implements stack.NetworkEndpoint.
+func (e *endpoint) Enabled() bool {
+	return e.nic.Enabled() && e.isEnabled()
 }
 
-// MaxHeaderLength returns the maximum length needed by ipv4 headers (and
-// underlying protocols).
-func (e *endpoint) MaxHeaderLength() uint16 {
-	return e.linkEP.MaxHeaderLength() + header.IPv4MinimumSize
+// isEnabled returns true if the endpoint is enabled, regardless of the
+// enabled status of the NIC.
+func (e *endpoint) isEnabled() bool {
+	return atomic.LoadUint32(&e.enabled) == 1
 }
 
-// GSOMaxSize returns the maximum GSO packet size.
-func (e *endpoint) GSOMaxSize() uint32 {
-	if gso, ok := e.linkEP.(stack.GSOEndpoint); ok {
-		return gso.GSOMaxSize()
+// setEnabled sets the enabled status for the endpoint.
+//
+// Returns true if the enabled status was updated.
+func (e *endpoint) setEnabled(v bool) bool {
+	if v {
+		return atomic.SwapUint32(&e.enabled, 1) == 0
 	}
-	return 0
+	return atomic.SwapUint32(&e.enabled, 0) == 1
 }
 
-// NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber.
-func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
-	return e.protocol.Number()
+// Disable implements stack.NetworkEndpoint.
+func (e *endpoint) Disable() {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	e.disableLocked()
 }
 
-// writePacketFragments calls e.linkEP.WritePacket with each packet fragment to
-// write. It assumes that the IP header is already present in pkt.NetworkHeader.
-// pkt.TransportHeader may be set. mtu includes the IP header and options. This
-// does not support the DontFragment IP flag.
-func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt *stack.PacketBuffer) *tcpip.Error {
-	// This packet is too big, it needs to be fragmented.
-	ip := header.IPv4(pkt.NetworkHeader().View())
-	flags := ip.Flags()
-
-	// Update mtu to take into account the header, which will exist in all
-	// fragments anyway.
-	innerMTU := mtu - int(ip.HeaderLength())
-
-	// Round the MTU down to align to 8 bytes. Then calculate the number of
-	// fragments. Calculate fragment sizes as in RFC791.
-	innerMTU &^= 7
-	n := (int(ip.PayloadLength()) + innerMTU - 1) / innerMTU
-
-	outerMTU := innerMTU + int(ip.HeaderLength())
-	offset := ip.FragmentOffset()
-
-	// Keep the length reserved for link-layer, we need to create fragments with
-	// the same reserved length.
-	reservedForLink := pkt.AvailableHeaderBytes()
-
-	// Destroy the packet, pull all payloads out for fragmentation.
-	transHeader, data := pkt.TransportHeader().View(), pkt.Data
-
-	// Where possible, the first fragment that is sent has the same
-	// number of bytes reserved for header as the input packet. The link-layer
-	// endpoint may depend on this for looking at, eg, L4 headers.
-	transFitsFirst := len(transHeader) <= innerMTU
-
-	for i := 0; i < n; i++ {
-		reserve := reservedForLink + int(ip.HeaderLength())
-		if i == 0 && transFitsFirst {
-			// Reserve for transport header if it's going to be put in the first
-			// fragment.
-			reserve += len(transHeader)
-		}
-		fragPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-			ReserveHeaderBytes: reserve,
-		})
-		fragPkt.NetworkProtocolNumber = header.IPv4ProtocolNumber
-
-		// Copy data for the fragment.
-		avail := innerMTU
+func (e *endpoint) disableLocked() {
+	if !e.setEnabled(false) {
+		return
+	}
 
-		if n := len(transHeader); n > 0 {
-			if n > avail {
-				n = avail
-			}
-			if i == 0 && transFitsFirst {
-				copy(fragPkt.TransportHeader().Push(n), transHeader)
-			} else {
-				fragPkt.Data.AppendView(transHeader[:n:n])
-			}
-			transHeader = transHeader[n:]
-			avail -= n
-		}
+	// The endpoint may have already left the multicast group.
+	if _, err := e.mu.addressableEndpointState.LeaveGroup(header.IPv4AllSystems); err != nil && err != tcpip.ErrBadLocalAddress {
+		panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv4AllSystems, err))
+	}
 
-		if avail > 0 {
-			n := data.Size()
-			if n > avail {
-				n = avail
-			}
-			data.ReadToVV(&fragPkt.Data, n)
-			avail -= n
-		}
+	// The address may have already been removed.
+	if err := e.mu.addressableEndpointState.RemovePermanentAddress(ipv4BroadcastAddr.Address); err != nil && err != tcpip.ErrBadLocalAddress {
+		panic(fmt.Sprintf("unexpected error when removing address = %s: %s", ipv4BroadcastAddr.Address, err))
+	}
+}
 
-		copied := uint16(innerMTU - avail)
-
-		// Set lengths in header and calculate checksum.
-		h := header.IPv4(fragPkt.NetworkHeader().Push(len(ip)))
-		copy(h, ip)
-		if i != n-1 {
-			h.SetTotalLength(uint16(outerMTU))
-			h.SetFlagsFragmentOffset(flags|header.IPv4FlagMoreFragments, offset)
-		} else {
-			h.SetTotalLength(uint16(h.HeaderLength()) + copied)
-			h.SetFlagsFragmentOffset(flags, offset)
-		}
-		h.SetChecksum(0)
-		h.SetChecksum(^h.CalculateChecksum())
-		offset += copied
+// DefaultTTL is the default time-to-live value for this endpoint.
+func (e *endpoint) DefaultTTL() uint8 {
+	return e.protocol.DefaultTTL()
+}
 
-		// Send out the fragment.
-		if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, fragPkt); err != nil {
-			return err
-		}
-		r.Stats().IP.PacketsSent.Increment()
+// MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus
+// the network layer max header length.
+func (e *endpoint) MTU() uint32 {
+	networkMTU, err := calculateNetworkMTU(e.nic.MTU(), header.IPv4MinimumSize)
+	if err != nil {
+		return 0
 	}
-	return nil
+	return networkMTU
+}
+
+// MaxHeaderLength returns the maximum length needed by ipv4 headers (and
+// underlying protocols).
+func (e *endpoint) MaxHeaderLength() uint16 {
+	return e.nic.MaxHeaderLength() + header.IPv4MaximumHeaderSize
+}
+
+// NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber.
+func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return e.protocol.Number()
 }
 
 func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams) {
@@ -222,30 +212,59 @@ func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params s
 		DstAddr:     r.RemoteAddress,
 	})
 	ip.SetChecksum(^ip.CalculateChecksum())
-	pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber
+	pkt.NetworkProtocolNumber = ProtocolNumber
+}
+
+// handleFragments fragments pkt and calls the handler function on each
+// fragment. It returns the number of fragments handled and the number of
+// fragments left to be processed. The IP header must already be present in the
+// original packet.
+func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, networkMTU uint32, pkt *stack.PacketBuffer, handler func(*stack.PacketBuffer) *tcpip.Error) (int, int, *tcpip.Error) {
+	// Round the MTU down to align to 8 bytes.
+	fragmentPayloadSize := networkMTU &^ 7
+	networkHeader := header.IPv4(pkt.NetworkHeader().View())
+	pf := fragmentation.MakePacketFragmenter(pkt, fragmentPayloadSize, pkt.AvailableHeaderBytes()+len(networkHeader))
+
+	var n int
+	for {
+		fragPkt, more := buildNextFragment(&pf, networkHeader)
+		if err := handler(fragPkt); err != nil {
+			return n, pf.RemainingFragmentCount() + 1, err
+		}
+		n++
+		if !more {
+			return n, pf.RemainingFragmentCount(), nil
+		}
+	}
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
 	e.addIPHeader(r, pkt, params)
+	return e.writePacket(r, gso, pkt)
+}
 
+func (e *endpoint) writePacket(r *stack.Route, gso *stack.GSO, pkt *stack.PacketBuffer) *tcpip.Error {
 	// iptables filtering. All packets that reach here are locally
 	// generated.
-	nicName := e.stack.FindNICNameFromID(e.NICID())
-	ipt := e.stack.IPTables()
+	nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
+	ipt := e.protocol.stack.IPTables()
 	if ok := ipt.Check(stack.Output, pkt, gso, r, "", nicName); !ok {
 		// iptables is telling us to drop the packet.
+		r.Stats().IP.IPTablesOutputDropped.Increment()
 		return nil
 	}
 
-	// If the packet is manipulated as per NAT Ouput rules, handle packet
-	// based on destination address and do not send the packet to link layer.
-	// TODO(gvisor.dev/issue/170): We should do this for every packet, rather than
-	// only NATted packets, but removing this check short circuits broadcasts
-	// before they are sent out to other hosts.
+	// If the packet is manipulated as per NAT Output rules, handle packet
+	// based on destination address and do not send the packet to link
+	// layer.
+	//
+	// TODO(gvisor.dev/issue/170): We should do this for every
+	// packet, rather than only NATted packets, but removing this check
+	// short circuits broadcasts before they are sent out to other hosts.
 	if pkt.NatDone {
 		netHeader := header.IPv4(pkt.NetworkHeader().View())
-		ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress())
+		ep, err := e.protocol.stack.FindNetworkEndpoint(ProtocolNumber, netHeader.DestinationAddress())
 		if err == nil {
 			route := r.ReverseRoute(netHeader.SourceAddress(), netHeader.DestinationAddress())
 			ep.HandlePacket(&route, pkt)
@@ -261,10 +280,28 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 	if r.Loop&stack.PacketOut == 0 {
 		return nil
 	}
-	if pkt.Size() > int(e.linkEP.MTU()) && (gso == nil || gso.Type == stack.GSONone) {
-		return e.writePacketFragments(r, gso, int(e.linkEP.MTU()), pkt)
+
+	networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size()))
+	if err != nil {
+		r.Stats().IP.OutgoingPacketErrors.Increment()
+		return err
 	}
-	if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+
+	if packetMustBeFragmented(pkt, networkMTU, gso) {
+		sent, remain, err := e.handleFragments(r, gso, networkMTU, pkt, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
+			// TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each
+			// fragment one by one using WritePacket() (current strategy) or if we
+			// want to create a PacketBufferList from the fragments and feed it to
+			// WritePackets(). It'll be faster but cost more memory.
+			return e.nic.WritePacket(r, gso, ProtocolNumber, fragPkt)
+		})
+		r.Stats().IP.PacketsSent.IncrementBy(uint64(sent))
+		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(remain))
+		return err
+	}
+
+	if err := e.nic.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+		r.Stats().IP.OutgoingPacketErrors.Increment()
 		return err
 	}
 	r.Stats().IP.PacketsSent.Increment()
@@ -280,25 +317,49 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 		return pkts.Len(), nil
 	}
 
-	for pkt := pkts.Front(); pkt != nil; {
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
 		e.addIPHeader(r, pkt, params)
-		pkt = pkt.Next()
+		networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size()))
+		if err != nil {
+			r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len()))
+			return 0, err
+		}
+
+		if packetMustBeFragmented(pkt, networkMTU, gso) {
+			// Keep track of the packet that is about to be fragmented so it can be
+			// removed once the fragmentation is done.
+			originalPkt := pkt
+			if _, _, err := e.handleFragments(r, gso, networkMTU, pkt, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
+				// Modify the packet list in place with the new fragments.
+				pkts.InsertAfter(pkt, fragPkt)
+				pkt = fragPkt
+				return nil
+			}); err != nil {
+				panic(fmt.Sprintf("e.handleFragments(_, _, %d, _, _) = %s", networkMTU, err))
+			}
+			// Remove the packet that was just fragmented and process the rest.
+			pkts.Remove(originalPkt)
+		}
 	}
 
-	nicName := e.stack.FindNICNameFromID(e.NICID())
+	nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
 	// iptables filtering. All packets that reach here are locally
 	// generated.
-	ipt := e.stack.IPTables()
+	ipt := e.protocol.stack.IPTables()
 	dropped, natPkts := ipt.CheckPackets(stack.Output, pkts, gso, r, nicName)
 	if len(dropped) == 0 && len(natPkts) == 0 {
 		// Fast path: If no packets are to be dropped then we can just invoke the
 		// faster WritePackets API directly.
-		n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
+		n, err := e.nic.WritePackets(r, gso, pkts, ProtocolNumber)
 		r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+		if err != nil {
+			r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len() - n))
+		}
 		return n, err
 	}
+	r.Stats().IP.IPTablesOutputDropped.IncrementBy(uint64(len(dropped)))
 
-	// Slow Path as we are dropping some packets in the batch degrade to
+	// Slow path as we are dropping some packets in the batch degrade to
 	// emitting one packet at a time.
 	n := 0
 	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
@@ -307,7 +368,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 		}
 		if _, ok := natPkts[pkt]; ok {
 			netHeader := header.IPv4(pkt.NetworkHeader().View())
-			if ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress()); err == nil {
+			if ep, err := e.protocol.stack.FindNetworkEndpoint(ProtocolNumber, netHeader.DestinationAddress()); err == nil {
 				src := netHeader.SourceAddress()
 				dst := netHeader.DestinationAddress()
 				route := r.ReverseRoute(src, dst)
@@ -316,40 +377,41 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 				continue
 			}
 		}
-		if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+		if err := e.nic.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
 			r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
-			return n, err
+			r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len() - n - len(dropped)))
+			// Dropped packets aren't errors, so include them in
+			// the return value.
+			return n + len(dropped), err
 		}
 		n++
 	}
 	r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
-	return n, nil
+	// Dropped packets aren't errors, so include them in the return value.
+	return n + len(dropped), nil
 }
 
-// WriteHeaderIncludedPacket writes a packet already containing a network
-// header through the given route.
+// WriteHeaderIncludedPacket implements stack.NetworkEndpoint.
 func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
 	// The packet already has an IP header, but there are a few required
 	// checks.
 	h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
 	if !ok {
-		return tcpip.ErrInvalidOptionValue
+		return tcpip.ErrMalformedHeader
 	}
 	ip := header.IPv4(h)
-	if !ip.IsValid(pkt.Data.Size()) {
-		return tcpip.ErrInvalidOptionValue
-	}
 
 	// Always set the total length.
-	ip.SetTotalLength(uint16(pkt.Data.Size()))
+	pktSize := pkt.Data.Size()
+	ip.SetTotalLength(uint16(pktSize))
 
 	// Set the source address when zero.
-	if ip.SourceAddress() == tcpip.Address(([]byte{0, 0, 0, 0})) {
+	if ip.SourceAddress() == header.IPv4Any {
 		ip.SetSourceAddress(r.LocalAddress)
 	}
 
-	// Set the destination. If the packet already included a destination,
-	// it will be part of the route.
+	// Set the destination. If the packet already included a destination, it will
+	// be part of the route anyways.
 	ip.SetDestinationAddress(r.RemoteAddress)
 
 	// Set the packet ID when zero.
@@ -366,32 +428,73 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBu
 	ip.SetChecksum(0)
 	ip.SetChecksum(^ip.CalculateChecksum())
 
-	if r.Loop&stack.PacketLoop != 0 {
-		e.HandlePacket(r, pkt.Clone())
+	// Populate the packet buffer's network header and don't allow an invalid
+	// packet to be sent.
+	//
+	// Note that parsing only makes sure that the packet is well formed as per the
+	// wire format. We also want to check if the header's fields are valid before
+	// sending the packet.
+	if !parse.IPv4(pkt) || !header.IPv4(pkt.NetworkHeader().View()).IsValid(pktSize) {
+		return tcpip.ErrMalformedHeader
 	}
-	if r.Loop&stack.PacketOut == 0 {
-		return nil
-	}
-
-	r.Stats().IP.PacketsSent.Increment()
 
-	return e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt)
+	return e.writePacket(r, nil /* gso */, pkt)
 }
 
 // HandlePacket is called by the link layer when new ipv4 packets arrive for
 // this endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+	if !e.isEnabled() {
+		return
+	}
+
 	h := header.IPv4(pkt.NetworkHeader().View())
 	if !h.IsValid(pkt.Data.Size() + pkt.NetworkHeader().View().Size() + pkt.TransportHeader().View().Size()) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
 		return
 	}
 
+	// There has been some confusion regarding verifying checksums. We need
+	// just look for negative 0 (0xffff) as the checksum, as it's not possible to
+	// get positive 0 (0) for the checksum. Some bad implementations could get it
+	// when doing entry replacement in the early days of the Internet,
+	// however the lore that one needs to check for both persists.
+	//
+	// RFC 1624 section 1 describes the source of this confusion as:
+	//     [the partial recalculation method described in RFC 1071] computes a
+	//     result for certain cases that differs from the one obtained from
+	//     scratch (one's complement of one's complement sum of the original
+	//     fields).
+	//
+	// However RFC 1624 section 5 clarifies that if using the verification method
+	// "recommended by RFC 1071, it does not matter if an intermediate system
+	// generated a -0 instead of +0".
+	//
+	// RFC1071 page 1 specifies the verification method as:
+	//	  (3)  To check a checksum, the 1's complement sum is computed over the
+	//        same set of octets, including the checksum field.  If the result
+	//        is all 1 bits (-0 in 1's complement arithmetic), the check
+	//        succeeds.
+	if h.CalculateChecksum() != 0xffff {
+		r.Stats().IP.MalformedPacketsReceived.Increment()
+		return
+	}
+
+	// As per RFC 1122 section 3.2.1.3:
+	//   When a host sends any datagram, the IP source address MUST
+	//   be one of its own IP addresses (but not a broadcast or
+	//   multicast address).
+	if r.IsOutboundBroadcast() || header.IsV4MulticastAddress(r.RemoteAddress) {
+		r.Stats().IP.InvalidSourceAddressesReceived.Increment()
+		return
+	}
+
 	// iptables filtering. All packets that reach here are intended for
 	// this machine and will not be forwarded.
-	ipt := e.stack.IPTables()
+	ipt := e.protocol.stack.IPTables()
 	if ok := ipt.Check(stack.Input, pkt, nil, nil, "", ""); !ok {
 		// iptables is telling us to drop the packet.
+		r.Stats().IP.IPTablesInputDropped.Increment()
 		return
 	}
 
@@ -404,30 +507,59 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 			return
 		}
 		// The packet is a fragment, let's try to reassemble it.
-		last := h.FragmentOffset() + uint16(pkt.Data.Size()) - 1
-		// Drop the packet if the fragmentOffset is incorrect. i.e the
-		// combination of fragmentOffset and pkt.Data.size() causes a
-		// wrap around resulting in last being less than the offset.
-		if last < h.FragmentOffset() {
+		start := h.FragmentOffset()
+		// Drop the fragment if the size of the reassembled payload would exceed the
+		// maximum payload size.
+		//
+		// Note that this addition doesn't overflow even on 32bit architecture
+		// because pkt.Data.Size() should not exceed 65535 (the max IP datagram
+		// size). Otherwise the packet would've been rejected as invalid before
+		// reaching here.
+		if int(start)+pkt.Data.Size() > header.IPv4MaximumPayloadSize {
 			r.Stats().IP.MalformedPacketsReceived.Increment()
 			r.Stats().IP.MalformedFragmentsReceived.Increment()
 			return
 		}
+
+		// Set up a callback in case we need to send a Time Exceeded Message, as per
+		// RFC 792:
+		//
+		//   If a host reassembling a fragmented datagram cannot complete the
+		//   reassembly due to missing fragments within its time limit it discards
+		//   the datagram, and it may send a time exceeded message.
+		//
+		//   If fragment zero is not available then no time exceeded need be sent at
+		//   all.
+		var releaseCB func(bool)
+		if start == 0 {
+			pkt := pkt.Clone()
+			r := r.Clone()
+			releaseCB = func(timedOut bool) {
+				if timedOut {
+					_ = e.protocol.returnError(&r, &icmpReasonReassemblyTimeout{}, pkt)
+				}
+				r.Release()
+			}
+		}
+
 		var ready bool
 		var err error
-		pkt.Data, ready, err = e.protocol.fragmentation.Process(
+		proto := h.Protocol()
+		pkt.Data, _, ready, err = e.protocol.fragmentation.Process(
 			// As per RFC 791 section 2.3, the identification value is unique
 			// for a source-destination pair and protocol.
 			fragmentation.FragmentID{
 				Source:      h.SourceAddress(),
 				Destination: h.DestinationAddress(),
 				ID:          uint32(h.ID()),
-				Protocol:    h.Protocol(),
+				Protocol:    proto,
 			},
-			h.FragmentOffset(),
-			last,
+			start,
+			start+uint16(pkt.Data.Size())-1,
 			h.More(),
+			proto,
 			pkt.Data,
+			releaseCB,
 		)
 		if err != nil {
 			r.Stats().IP.MalformedPacketsReceived.Increment()
@@ -438,27 +570,165 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 			return
 		}
 	}
+
+	r.Stats().IP.PacketsDelivered.Increment()
 	p := h.TransportProtocol()
 	if p == header.ICMPv4ProtocolNumber {
+		// TODO(gvisor.dev/issues/3810): when we sort out ICMP and transport
+		// headers, the setting of the transport number here should be
+		// unnecessary and removed.
+		pkt.TransportProtocolNumber = p
 		e.handleICMP(r, pkt)
 		return
 	}
-	r.Stats().IP.PacketsDelivered.Increment()
-	e.dispatcher.DeliverTransportPacket(r, p, pkt)
+
+	switch res := e.dispatcher.DeliverTransportPacket(r, p, pkt); res {
+	case stack.TransportPacketHandled:
+	case stack.TransportPacketDestinationPortUnreachable:
+		// As per RFC: 1122 Section 3.2.2.1 A host SHOULD generate Destination
+		//   Unreachable messages with code:
+		//     3 (Port Unreachable), when the designated transport protocol
+		//     (e.g., UDP) is unable to demultiplex the datagram but has no
+		//     protocol mechanism to inform the sender.
+		_ = e.protocol.returnError(r, &icmpReasonPortUnreachable{}, pkt)
+	case stack.TransportPacketProtocolUnreachable:
+		// As per RFC: 1122 Section 3.2.2.1
+		//   A host SHOULD generate Destination Unreachable messages with code:
+		//     2 (Protocol Unreachable), when the designated transport protocol
+		//     is not supported
+		_ = e.protocol.returnError(r, &icmpReasonProtoUnreachable{}, pkt)
+	default:
+		panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res))
+	}
 }
 
 // Close cleans up resources associated with the endpoint.
-func (e *endpoint) Close() {}
+func (e *endpoint) Close() {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	e.disableLocked()
+	e.mu.addressableEndpointState.Cleanup()
+}
+
+// AddAndAcquirePermanentAddress implements stack.AddressableEndpoint.
+func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, *tcpip.Error) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, peb, configType, deprecated)
+}
+
+// RemovePermanentAddress implements stack.AddressableEndpoint.
+func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.mu.addressableEndpointState.RemovePermanentAddress(addr)
+}
+
+// MainAddress implements stack.AddressableEndpoint.
+func (e *endpoint) MainAddress() tcpip.AddressWithPrefix {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.mu.addressableEndpointState.MainAddress()
+}
+
+// AcquireAssignedAddress implements stack.AddressableEndpoint.
+func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	loopback := e.nic.IsLoopback()
+	addressEndpoint := e.mu.addressableEndpointState.ReadOnly().AddrOrMatching(localAddr, allowTemp, func(addressEndpoint stack.AddressEndpoint) bool {
+		subnet := addressEndpoint.AddressWithPrefix().Subnet()
+		// IPv4 has a notion of a subnet broadcast address and considers the
+		// loopback interface bound to an address's whole subnet (on linux).
+		return subnet.IsBroadcast(localAddr) || (loopback && subnet.Contains(localAddr))
+	})
+	if addressEndpoint != nil {
+		return addressEndpoint
+	}
+
+	if !allowTemp {
+		return nil
+	}
+
+	addr := localAddr.WithPrefix()
+	addressEndpoint, err := e.mu.addressableEndpointState.AddAndAcquireTemporaryAddress(addr, tempPEB)
+	if err != nil {
+		// AddAddress only returns an error if the address is already assigned,
+		// but we just checked above if the address exists so we expect no error.
+		panic(fmt.Sprintf("e.mu.addressableEndpointState.AddAndAcquireTemporaryAddress(%s, %d): %s", addr, tempPEB, err))
+	}
+	return addressEndpoint
+}
+
+// AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint.
+func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.mu.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, allowExpired)
+}
+
+// PrimaryAddresses implements stack.AddressableEndpoint.
+func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.mu.addressableEndpointState.PrimaryAddresses()
+}
+
+// PermanentAddresses implements stack.AddressableEndpoint.
+func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.mu.addressableEndpointState.PermanentAddresses()
+}
+
+// JoinGroup implements stack.GroupAddressableEndpoint.
+func (e *endpoint) JoinGroup(addr tcpip.Address) (bool, *tcpip.Error) {
+	if !header.IsV4MulticastAddress(addr) {
+		return false, tcpip.ErrBadAddress
+	}
+
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.mu.addressableEndpointState.JoinGroup(addr)
+}
+
+// LeaveGroup implements stack.GroupAddressableEndpoint.
+func (e *endpoint) LeaveGroup(addr tcpip.Address) (bool, *tcpip.Error) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.mu.addressableEndpointState.LeaveGroup(addr)
+}
+
+// IsInGroup implements stack.GroupAddressableEndpoint.
+func (e *endpoint) IsInGroup(addr tcpip.Address) bool {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.mu.addressableEndpointState.IsInGroup(addr)
+}
+
+var _ stack.ForwardingNetworkProtocol = (*protocol)(nil)
+var _ stack.NetworkProtocol = (*protocol)(nil)
 
 type protocol struct {
-	ids    []uint32
-	hashIV uint32
+	stack *stack.Stack
 
 	// defaultTTL is the current default TTL for the protocol. Only the
-	// uint8 portion of it is meaningful and it must be accessed
-	// atomically.
+	// uint8 portion of it is meaningful.
+	//
+	// Must be accessed using atomic operations.
 	defaultTTL uint32
 
+	// forwarding is set to 1 when the protocol has forwarding enabled and 0
+	// when it is disabled.
+	//
+	// Must be accessed using atomic operations.
+	forwarding uint32
+
+	ids    []uint32
+	hashIV uint32
+
 	fragmentation *fragmentation.Fragmentation
 }
 
@@ -484,10 +754,10 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 }
 
 // SetOption implements NetworkProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case tcpip.DefaultTTLOption:
-		p.SetDefaultTTL(uint8(v))
+	case *tcpip.DefaultTTLOption:
+		p.SetDefaultTTL(uint8(*v))
 		return nil
 	default:
 		return tcpip.ErrUnknownProtocolOption
@@ -495,7 +765,7 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 }
 
 // Option implements NetworkProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
 	case *tcpip.DefaultTTLOption:
 		*v = tcpip.DefaultTTLOption(p.DefaultTTL())
@@ -521,61 +791,80 @@ func (*protocol) Close() {}
 // Wait implements stack.TransportProtocol.Wait.
 func (*protocol) Wait() {}
 
-// Parse implements stack.TransportProtocol.Parse.
+// Parse implements stack.NetworkProtocol.Parse.
 func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
-	hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
-	if !ok {
+	if ok := parse.IPv4(pkt); !ok {
 		return 0, false, false
 	}
-	ipHdr := header.IPv4(hdr)
 
-	// Header may have options, determine the true header length.
-	headerLen := int(ipHdr.HeaderLength())
-	if headerLen < header.IPv4MinimumSize {
-		// TODO(gvisor.dev/issue/2404): Per RFC 791, IHL needs to be at least 5 in
-		// order for the packet to be valid. Figure out if we want to reject this
-		// case.
-		headerLen = header.IPv4MinimumSize
+	ipHdr := header.IPv4(pkt.NetworkHeader().View())
+	return ipHdr.TransportProtocol(), !ipHdr.More() && ipHdr.FragmentOffset() == 0, true
+}
+
+// Forwarding implements stack.ForwardingNetworkProtocol.
+func (p *protocol) Forwarding() bool {
+	return uint8(atomic.LoadUint32(&p.forwarding)) == 1
+}
+
+// SetForwarding implements stack.ForwardingNetworkProtocol.
+func (p *protocol) SetForwarding(v bool) {
+	if v {
+		atomic.StoreUint32(&p.forwarding, 1)
+	} else {
+		atomic.StoreUint32(&p.forwarding, 0)
 	}
-	hdr, ok = pkt.NetworkHeader().Consume(headerLen)
-	if !ok {
-		return 0, false, false
+}
+
+// calculateNetworkMTU calculates the network-layer payload MTU based on the
+// link-layer payload mtu.
+func calculateNetworkMTU(linkMTU, networkHeaderSize uint32) (uint32, *tcpip.Error) {
+	if linkMTU < header.IPv4MinimumMTU {
+		return 0, tcpip.ErrInvalidEndpointState
+	}
+
+	// As per RFC 791 section 3.1, an IPv4 header cannot exceed 60 bytes in
+	// length:
+	//   The maximal internet header is 60 octets, and a typical internet header
+	//   is 20 octets, allowing a margin for headers of higher level protocols.
+	if networkHeaderSize > header.IPv4MaximumHeaderSize {
+		return 0, tcpip.ErrMalformedHeader
 	}
-	ipHdr = header.IPv4(hdr)
 
-	// If this is a fragment, don't bother parsing the transport header.
-	parseTransportHeader := true
-	if ipHdr.More() || ipHdr.FragmentOffset() != 0 {
-		parseTransportHeader = false
+	networkMTU := linkMTU
+	if networkMTU > MaxTotalSize {
+		networkMTU = MaxTotalSize
 	}
 
-	pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber
-	pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr))
-	return ipHdr.TransportProtocol(), parseTransportHeader, true
+	return networkMTU - uint32(networkHeaderSize), nil
 }
 
-// calculateMTU calculates the network-layer payload MTU based on the link-layer
-// payload mtu.
-func calculateMTU(mtu uint32) uint32 {
-	if mtu > MaxTotalSize {
-		mtu = MaxTotalSize
-	}
-	return mtu - header.IPv4MinimumSize
+func packetMustBeFragmented(pkt *stack.PacketBuffer, networkMTU uint32, gso *stack.GSO) bool {
+	payload := pkt.TransportHeader().View().Size() + pkt.Data.Size()
+	return (gso == nil || gso.Type == stack.GSONone) && uint32(payload) > networkMTU
+}
+
+// addressToUint32 translates an IPv4 address into its little endian uint32
+// representation.
+//
+// This function does the same thing as binary.LittleEndian.Uint32 but operates
+// on a tcpip.Address (a string) without the need to convert it to a byte slice,
+// which would cause an allocation.
+func addressToUint32(addr tcpip.Address) uint32 {
+	_ = addr[3] // bounds check hint to compiler
+	return uint32(addr[0]) | uint32(addr[1])<<8 | uint32(addr[2])<<16 | uint32(addr[3])<<24
 }
 
 // hashRoute calculates a hash value for the given route. It uses the source &
-// destination address, the transport protocol number, and a random initial
-// value (generated once on initialization) to generate the hash.
+// destination address, the transport protocol number and a 32-bit number to
+// generate the hash.
 func hashRoute(r *stack.Route, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 {
-	t := r.LocalAddress
-	a := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24
-	t = r.RemoteAddress
-	b := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24
+	a := addressToUint32(r.LocalAddress)
+	b := addressToUint32(r.RemoteAddress)
 	return hash.Hash3Words(a, b, uint32(protocol), hashIV)
 }
 
 // NewProtocol returns an IPv4 network protocol.
-func NewProtocol() stack.NetworkProtocol {
+func NewProtocol(s *stack.Stack) stack.NetworkProtocol {
 	ids := make([]uint32, buckets)
 
 	// Randomly initialize hashIV and the ids.
@@ -586,9 +875,33 @@ func NewProtocol() stack.NetworkProtocol {
 	hashIV := r[buckets]
 
 	return &protocol{
+		stack:         s,
 		ids:           ids,
 		hashIV:        hashIV,
 		defaultTTL:    DefaultTTL,
-		fragmentation: fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout),
+		fragmentation: fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock()),
+	}
+}
+
+func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeader header.IPv4) (*stack.PacketBuffer, bool) {
+	fragPkt, offset, copied, more := pf.BuildNextFragment()
+	fragPkt.NetworkProtocolNumber = ProtocolNumber
+
+	originalIPHeaderLength := len(originalIPHeader)
+	nextFragIPHeader := header.IPv4(fragPkt.NetworkHeader().Push(originalIPHeaderLength))
+
+	if copied := copy(nextFragIPHeader, originalIPHeader); copied != len(originalIPHeader) {
+		panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got = %d, want = %d", copied, originalIPHeaderLength))
 	}
+
+	flags := originalIPHeader.Flags()
+	if more {
+		flags |= header.IPv4FlagMoreFragments
+	}
+	nextFragIPHeader.SetFlagsFragmentOffset(flags, uint16(offset))
+	nextFragIPHeader.SetTotalLength(uint16(nextFragIPHeader.HeaderLength()) + uint16(copied))
+	nextFragIPHeader.SetChecksum(0)
+	nextFragIPHeader.SetChecksum(^nextFragIPHeader.CalculateChecksum())
+
+	return fragPkt, more
 }
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index 197e3bc51..dbe0935be 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -15,32 +15,42 @@
 package ipv4_test
 
 import (
-	"bytes"
+	"context"
 	"encoding/hex"
 	"fmt"
-	"math/rand"
+	"math"
+	"net"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/faketime"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/testutil"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+const (
+	extraHeaderReserve = 50
+	defaultMTU         = 65536
+)
+
 func TestExcludeBroadcast(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
 	})
 
-	const defaultMTU = 65536
 	ep := stack.LinkEndpoint(channel.New(256, defaultMTU, ""))
 	if testing.Verbose() {
 		ep = sniffer.New(ep)
@@ -92,38 +102,296 @@ func TestExcludeBroadcast(t *testing.T) {
 	})
 }
 
-// makeRandPkt generates a randomize packet. hdrLength indicates how much
-// data should already be in the header before WritePacket. extraLength
-// indicates how much extra space should be in the header. The payload is made
-// from many Views of the sizes listed in viewSizes.
-func makeRandPkt(hdrLength int, extraLength int, viewSizes []int) *stack.PacketBuffer {
-	var views []buffer.View
-	totalLength := 0
-	for _, s := range viewSizes {
-		newView := buffer.NewView(s)
-		rand.Read(newView)
-		views = append(views, newView)
-		totalLength += s
-	}
-
-	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-		ReserveHeaderBytes: hdrLength + extraLength,
-		Data:               buffer.NewVectorisedView(totalLength, views),
-	})
-	pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber
-	if _, err := rand.Read(pkt.TransportHeader().Push(hdrLength)); err != nil {
-		panic(fmt.Sprintf("rand.Read: %s", err))
+// TestIPv4Sanity sends IP/ICMP packets with various problems to the stack and
+// checks the response.
+func TestIPv4Sanity(t *testing.T) {
+	const (
+		ttl            = 255
+		nicID          = 1
+		randomSequence = 123
+		randomIdent    = 42
+	)
+	var (
+		ipv4Addr = tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("192.168.1.58").To4()),
+			PrefixLen: 24,
+		}
+		remoteIPv4Addr = tcpip.Address(net.ParseIP("10.0.0.1").To4())
+	)
+
+	tests := []struct {
+		name              string
+		headerLength      uint8 // value of 0 means "use correct size"
+		badHeaderChecksum bool
+		maxTotalLength    uint16
+		transportProtocol uint8
+		TTL               uint8
+		shouldFail        bool
+		expectICMP        bool
+		ICMPType          header.ICMPv4Type
+		ICMPCode          header.ICMPv4Code
+		options           []byte
+	}{
+		{
+			name:              "valid",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+		},
+		{
+			name:              "bad header checksum",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			badHeaderChecksum: true,
+			shouldFail:        true,
+		},
+		// The TTL tests check that we are not rejecting an incoming packet
+		// with a zero or one TTL, which has been a point of confusion in the
+		// past as RFC 791 says: "If this field contains the value zero, then the
+		// datagram must be destroyed". However RFC 1122 section 3.2.1.7 clarifies
+		// for the case of the destination host, stating as follows.
+		//
+		//      A host MUST NOT send a datagram with a Time-to-Live (TTL)
+		//      value of zero.
+		//
+		//      A host MUST NOT discard a datagram just because it was
+		//      received with TTL less than 2.
+		{
+			name:              "zero TTL",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               0,
+			shouldFail:        false,
+		},
+		{
+			name:              "one TTL",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               1,
+			shouldFail:        false,
+		},
+		{
+			name:              "End options",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			options:           []byte{0, 0, 0, 0},
+		},
+		{
+			name:              "NOP options",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			options:           []byte{1, 1, 1, 1},
+		},
+		{
+			name:              "NOP and End options",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			options:           []byte{1, 1, 0, 0},
+		},
+		{
+			name:              "bad header length",
+			headerLength:      header.IPv4MinimumSize - 1,
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			shouldFail:        true,
+			expectICMP:        false,
+		},
+		{
+			name:              "bad total length (0)",
+			maxTotalLength:    0,
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			shouldFail:        true,
+			expectICMP:        false,
+		},
+		{
+			name:              "bad total length (ip - 1)",
+			maxTotalLength:    uint16(header.IPv4MinimumSize - 1),
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			shouldFail:        true,
+			expectICMP:        false,
+		},
+		{
+			name:              "bad total length (ip + icmp - 1)",
+			maxTotalLength:    uint16(header.IPv4MinimumSize + header.ICMPv4MinimumSize - 1),
+			transportProtocol: uint8(header.ICMPv4ProtocolNumber),
+			TTL:               ttl,
+			shouldFail:        true,
+			expectICMP:        false,
+		},
+		{
+			name:              "bad protocol",
+			maxTotalLength:    ipv4.MaxTotalSize,
+			transportProtocol: 99,
+			TTL:               ttl,
+			shouldFail:        true,
+			expectICMP:        true,
+			ICMPType:          header.ICMPv4DstUnreachable,
+			ICMPCode:          header.ICMPv4ProtoUnreachable,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol4},
+			})
+			// We expect at most a single packet in response to our ICMP Echo Request.
+			e := channel.New(1, defaultMTU, "")
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+			}
+			ipv4ProtoAddr := tcpip.ProtocolAddress{Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: ipv4Addr}
+			if err := s.AddProtocolAddress(nicID, ipv4ProtoAddr); err != nil {
+				t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, ipv4ProtoAddr, err)
+			}
+
+			// Default routes for IPv4 so ICMP can find a route to the remote
+			// node when attempting to send the ICMP Echo Reply.
+			s.SetRouteTable([]tcpip.Route{
+				{
+					Destination: header.IPv4EmptySubnet,
+					NIC:         nicID,
+				},
+			})
+
+			// Round up the header size to the next multiple of 4 as RFC 791, page 11
+			// says: "Internet Header Length is the length of the internet header
+			// in 32 bit words..." and on page 23: "The internet header padding is
+			// used to ensure that the internet header ends on a 32 bit boundary."
+			ipHeaderLength := ((header.IPv4MinimumSize + len(test.options)) + header.IPv4IHLStride - 1) & ^(header.IPv4IHLStride - 1)
+
+			if ipHeaderLength > header.IPv4MaximumHeaderSize {
+				t.Fatalf("too many bytes in options: got = %d, want <= %d ", ipHeaderLength, header.IPv4MaximumHeaderSize)
+			}
+			totalLen := uint16(ipHeaderLength + header.ICMPv4MinimumSize)
+			hdr := buffer.NewPrependable(int(totalLen))
+			icmp := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize))
+
+			// Specify ident/seq to make sure we get the same in the response.
+			icmp.SetIdent(randomIdent)
+			icmp.SetSequence(randomSequence)
+			icmp.SetType(header.ICMPv4Echo)
+			icmp.SetCode(header.ICMPv4UnusedCode)
+			icmp.SetChecksum(0)
+			icmp.SetChecksum(^header.Checksum(icmp, 0))
+			ip := header.IPv4(hdr.Prepend(ipHeaderLength))
+			if test.maxTotalLength < totalLen {
+				totalLen = test.maxTotalLength
+			}
+			ip.Encode(&header.IPv4Fields{
+				IHL:         uint8(ipHeaderLength),
+				TotalLength: totalLen,
+				Protocol:    test.transportProtocol,
+				TTL:         test.TTL,
+				SrcAddr:     remoteIPv4Addr,
+				DstAddr:     ipv4Addr.Address,
+			})
+			if n := copy(ip.Options(), test.options); n != len(test.options) {
+				t.Fatalf("options larger than available space: copied %d/%d bytes", n, len(test.options))
+			}
+			// Override the correct value if the test case specified one.
+			if test.headerLength != 0 {
+				ip.SetHeaderLength(test.headerLength)
+			}
+			ip.SetChecksum(0)
+			ipHeaderChecksum := ip.CalculateChecksum()
+			if test.badHeaderChecksum {
+				ipHeaderChecksum += 42
+			}
+			ip.SetChecksum(^ipHeaderChecksum)
+			requestPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+				Data: hdr.View().ToVectorisedView(),
+			})
+			e.InjectInbound(header.IPv4ProtocolNumber, requestPkt)
+			reply, ok := e.Read()
+			if !ok {
+				if test.shouldFail {
+					if test.expectICMP {
+						t.Fatal("expected ICMP error response missing")
+					}
+					return // Expected silent failure.
+				}
+				t.Fatal("expected ICMP echo reply missing")
+			}
+
+			// Check the route that brought the packet to us.
+			if reply.Route.LocalAddress != ipv4Addr.Address {
+				t.Errorf("got pkt.Route.LocalAddress = %s, want = %s", reply.Route.LocalAddress, ipv4Addr.Address)
+			}
+			if reply.Route.RemoteAddress != remoteIPv4Addr {
+				t.Errorf("got pkt.Route.RemoteAddress = %s, want = %s", reply.Route.RemoteAddress, remoteIPv4Addr)
+			}
+
+			// Make sure it's all in one buffer.
+			vv := buffer.NewVectorisedView(reply.Pkt.Size(), reply.Pkt.Views())
+			replyIPHeader := header.IPv4(vv.ToView())
+
+			// At this stage we only know it's an IP header so verify that much.
+			checker.IPv4(t, replyIPHeader,
+				checker.SrcAddr(ipv4Addr.Address),
+				checker.DstAddr(remoteIPv4Addr),
+			)
+
+			// All expected responses are ICMP packets.
+			if got, want := replyIPHeader.Protocol(), uint8(header.ICMPv4ProtocolNumber); got != want {
+				t.Fatalf("not ICMP response, got protocol %d, want = %d", got, want)
+			}
+			replyICMPHeader := header.ICMPv4(replyIPHeader.Payload())
+
+			// Sanity check the response.
+			switch replyICMPHeader.Type() {
+			case header.ICMPv4DstUnreachable:
+				checker.IPv4(t, replyIPHeader,
+					checker.IPFullLength(uint16(header.IPv4MinimumSize+header.ICMPv4MinimumSize+requestPkt.Size())),
+					checker.IPv4HeaderLength(header.IPv4MinimumSize),
+					checker.ICMPv4(
+						checker.ICMPv4Code(test.ICMPCode),
+						checker.ICMPv4Checksum(),
+						checker.ICMPv4Payload([]byte(hdr.View())),
+					),
+				)
+				if !test.shouldFail || !test.expectICMP {
+					t.Fatalf("unexpected packet rejection, got ICMP error packet type %d, code %d",
+						header.ICMPv4DstUnreachable, replyICMPHeader.Code())
+				}
+				return
+			case header.ICMPv4EchoReply:
+				checker.IPv4(t, replyIPHeader,
+					checker.IPv4HeaderLength(ipHeaderLength),
+					checker.IPv4Options(test.options),
+					checker.IPFullLength(uint16(requestPkt.Size())),
+					checker.ICMPv4(
+						checker.ICMPv4Code(header.ICMPv4UnusedCode),
+						checker.ICMPv4Seq(randomSequence),
+						checker.ICMPv4Ident(randomIdent),
+						checker.ICMPv4Checksum(),
+					),
+				)
+				if test.shouldFail {
+					t.Fatalf("unexpected Echo Reply packet\n")
+				}
+			default:
+				t.Fatalf("unexpected ICMP response, got type %d, want = %d or %d",
+					replyICMPHeader.Type(), header.ICMPv4EchoReply, header.ICMPv4DstUnreachable)
+			}
+		})
 	}
-	return pkt
 }
 
 // comparePayloads compared the contents of all the packets against the contents
 // of the source packet.
-func compareFragments(t *testing.T, packets []*stack.PacketBuffer, sourcePacketInfo *stack.PacketBuffer, mtu uint32) {
-	t.Helper()
-	// Make a complete array of the sourcePacketInfo packet.
-	source := header.IPv4(packets[0].NetworkHeader().View()[:header.IPv4MinimumSize])
-	vv := buffer.NewVectorisedView(sourcePacketInfo.Size(), sourcePacketInfo.Views())
+func compareFragments(packets []*stack.PacketBuffer, sourcePacket *stack.PacketBuffer, mtu uint32, wantFragments []fragmentInfo, proto tcpip.TransportProtocolNumber) error {
+	// Make a complete array of the sourcePacket packet.
+	source := header.IPv4(packets[0].NetworkHeader().View())
+	vv := buffer.NewVectorisedView(sourcePacket.Size(), sourcePacket.Views())
 	source = append(source, vv.ToView()...)
 
 	// Make a copy of the IP header, which will be modified in some fields to make
@@ -132,361 +400,929 @@ func compareFragments(t *testing.T, packets []*stack.PacketBuffer, sourcePacketI
 	sourceCopy.SetChecksum(0)
 	sourceCopy.SetFlagsFragmentOffset(0, 0)
 	sourceCopy.SetTotalLength(0)
-	var offset uint16
 	// Build up an array of the bytes sent.
-	var reassembledPayload []byte
+	var reassembledPayload buffer.VectorisedView
 	for i, packet := range packets {
 		// Confirm that the packet is valid.
 		allBytes := buffer.NewVectorisedView(packet.Size(), packet.Views())
-		ip := header.IPv4(allBytes.ToView())
-		if !ip.IsValid(len(ip)) {
-			t.Errorf("IP packet is invalid:\n%s", hex.Dump(ip))
+		fragmentIPHeader := header.IPv4(allBytes.ToView())
+		if !fragmentIPHeader.IsValid(len(fragmentIPHeader)) {
+			return fmt.Errorf("fragment #%d: IP packet is invalid:\n%s", i, hex.Dump(fragmentIPHeader))
 		}
-		if got, want := ip.CalculateChecksum(), uint16(0xffff); got != want {
-			t.Errorf("ip.CalculateChecksum() got %#x, want %#x", got, want)
+		if got := len(fragmentIPHeader); got > int(mtu) {
+			return fmt.Errorf("fragment #%d: got len(fragmentIPHeader) = %d, want <= %d", i, got, mtu)
 		}
-		if got, want := len(ip), int(mtu); got > want {
-			t.Errorf("fragment is too large, got %d want %d", got, want)
+		if got := fragmentIPHeader.TransportProtocol(); got != proto {
+			return fmt.Errorf("fragment #%d: got fragmentIPHeader.TransportProtocol() = %d, want = %d", i, got, uint8(proto))
 		}
-		if i == 0 {
-			got := packet.NetworkHeader().View().Size() + packet.TransportHeader().View().Size()
-			// sourcePacketInfo does not have NetworkHeader added, simulate one.
-			want := header.IPv4MinimumSize + sourcePacketInfo.TransportHeader().View().Size()
-			// Check that it kept the transport header in packet.TransportHeader if
-			// it fits in the first fragment.
-			if want < int(mtu) && got != want {
-				t.Errorf("first fragment hdr parts should have unmodified length if possible: got %d, want %d", got, want)
-			}
+		if got := packet.AvailableHeaderBytes(); got != extraHeaderReserve {
+			return fmt.Errorf("fragment #%d: got packet.AvailableHeaderBytes() = %d, want = %d", i, got, extraHeaderReserve)
 		}
-		if got, want := packet.AvailableHeaderBytes(), sourcePacketInfo.AvailableHeaderBytes()-header.IPv4MinimumSize; got != want {
-			t.Errorf("fragment #%d should have the same available space for prepending as source: got %d, want %d", i, got, want)
+		if got, want := packet.NetworkProtocolNumber, sourcePacket.NetworkProtocolNumber; got != want {
+			return fmt.Errorf("fragment #%d: got fragment.NetworkProtocolNumber = %d, want = %d", i, got, want)
 		}
-		if got, want := packet.NetworkProtocolNumber, sourcePacketInfo.NetworkProtocolNumber; got != want {
-			t.Errorf("fragment #%d has wrong network protocol number: got %d, want %d", i, got, want)
+		if got, want := fragmentIPHeader.CalculateChecksum(), uint16(0xffff); got != want {
+			return fmt.Errorf("fragment #%d: got ip.CalculateChecksum() = %#x, want = %#x", i, got, want)
 		}
-		if i < len(packets)-1 {
-			sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()|header.IPv4FlagMoreFragments, offset)
+		if wantFragments[i].more {
+			sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()|header.IPv4FlagMoreFragments, wantFragments[i].offset)
 		} else {
-			sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()&^header.IPv4FlagMoreFragments, offset)
+			sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()&^header.IPv4FlagMoreFragments, wantFragments[i].offset)
 		}
-		reassembledPayload = append(reassembledPayload, ip.Payload()...)
-		offset += ip.TotalLength() - uint16(ip.HeaderLength())
+		reassembledPayload.AppendView(packet.TransportHeader().View())
+		reassembledPayload.Append(packet.Data)
 		// Clear out the checksum and length from the ip because we can't compare
 		// it.
-		sourceCopy.SetTotalLength(uint16(len(ip)))
+		sourceCopy.SetTotalLength(wantFragments[i].payloadSize + header.IPv4MinimumSize)
 		sourceCopy.SetChecksum(0)
 		sourceCopy.SetChecksum(^sourceCopy.CalculateChecksum())
-		if !bytes.Equal(ip[:ip.HeaderLength()], sourceCopy[:sourceCopy.HeaderLength()]) {
-			t.Errorf("ip[:ip.HeaderLength()] got:\n%s\nwant:\n%s", hex.Dump(ip[:ip.HeaderLength()]), hex.Dump(sourceCopy[:sourceCopy.HeaderLength()]))
-		}
-	}
-	expected := source[source.HeaderLength():]
-	if !bytes.Equal(reassembledPayload, expected) {
-		t.Errorf("reassembledPayload got:\n%s\nwant:\n%s", hex.Dump(reassembledPayload), hex.Dump(expected))
-	}
-}
-
-type errorChannel struct {
-	*channel.Endpoint
-	Ch                    chan *stack.PacketBuffer
-	packetCollectorErrors []*tcpip.Error
-}
-
-// newErrorChannel creates a new errorChannel endpoint. Each call to WritePacket
-// will return successive errors from packetCollectorErrors until the list is
-// empty and then return nil each time.
-func newErrorChannel(size int, mtu uint32, linkAddr tcpip.LinkAddress, packetCollectorErrors []*tcpip.Error) *errorChannel {
-	return &errorChannel{
-		Endpoint:              channel.New(size, mtu, linkAddr),
-		Ch:                    make(chan *stack.PacketBuffer, size),
-		packetCollectorErrors: packetCollectorErrors,
-	}
-}
-
-// Drain removes all outbound packets from the channel and counts them.
-func (e *errorChannel) Drain() int {
-	c := 0
-	for {
-		select {
-		case <-e.Ch:
-			c++
-		default:
-			return c
+		if diff := cmp.Diff(fragmentIPHeader[:fragmentIPHeader.HeaderLength()], sourceCopy[:sourceCopy.HeaderLength()]); diff != "" {
+			return fmt.Errorf("fragment #%d: fragmentIPHeader mismatch (-want +got):\n%s", i, diff)
 		}
 	}
-}
 
-// WritePacket stores outbound packets into the channel.
-func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
-	select {
-	case e.Ch <- pkt:
-	default:
+	expected := buffer.View(source[source.HeaderLength():])
+	if diff := cmp.Diff(expected, reassembledPayload.ToView()); diff != "" {
+		return fmt.Errorf("reassembledPayload mismatch (-want +got):\n%s", diff)
 	}
 
-	nextError := (*tcpip.Error)(nil)
-	if len(e.packetCollectorErrors) > 0 {
-		nextError = e.packetCollectorErrors[0]
-		e.packetCollectorErrors = e.packetCollectorErrors[1:]
-	}
-	return nextError
+	return nil
 }
 
-type context struct {
-	stack.Route
-	linkEP *errorChannel
+type fragmentInfo struct {
+	offset      uint16
+	more        bool
+	payloadSize uint16
 }
 
-func buildContext(t *testing.T, packetCollectorErrors []*tcpip.Error, mtu uint32) context {
-	// Make the packet and write it.
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()},
-	})
-	ep := newErrorChannel(100 /* Enough for all tests. */, mtu, "", packetCollectorErrors)
-	s.CreateNIC(1, ep)
-	const (
-		src = "\x10\x00\x00\x01"
-		dst = "\x10\x00\x00\x02"
-	)
-	s.AddAddress(1, ipv4.ProtocolNumber, src)
+var fragmentationTests = []struct {
+	description           string
+	mtu                   uint32
+	gso                   *stack.GSO
+	transportHeaderLength int
+	payloadSize           int
+	wantFragments         []fragmentInfo
+}{
 	{
-		subnet, err := tcpip.NewSubnet(dst, tcpip.AddressMask(header.IPv4Broadcast))
-		if err != nil {
-			t.Fatal(err)
-		}
-		s.SetRouteTable([]tcpip.Route{{
-			Destination: subnet,
-			NIC:         1,
-		}})
-	}
-	r, err := s.FindRoute(0, src, dst, ipv4.ProtocolNumber, false /* multicastLoop */)
-	if err != nil {
-		t.Fatalf("s.FindRoute got %v, want %v", err, nil)
-	}
-	return context{
-		Route:  r,
-		linkEP: ep,
-	}
+		description:           "No fragmentation",
+		mtu:                   1280,
+		gso:                   nil,
+		transportHeaderLength: 0,
+		payloadSize:           1000,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 1000, more: false},
+		},
+	},
+	{
+		description:           "Fragmented",
+		mtu:                   1280,
+		gso:                   nil,
+		transportHeaderLength: 0,
+		payloadSize:           2000,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 1256, more: true},
+			{offset: 1256, payloadSize: 744, more: false},
+		},
+	},
+	{
+		description:           "Fragmented with the minimum mtu",
+		mtu:                   header.IPv4MinimumMTU,
+		gso:                   nil,
+		transportHeaderLength: 0,
+		payloadSize:           100,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 48, more: true},
+			{offset: 48, payloadSize: 48, more: true},
+			{offset: 96, payloadSize: 4, more: false},
+		},
+	},
+	{
+		description:           "Fragmented with mtu not a multiple of 8",
+		mtu:                   header.IPv4MinimumMTU + 1,
+		gso:                   nil,
+		transportHeaderLength: 0,
+		payloadSize:           100,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 48, more: true},
+			{offset: 48, payloadSize: 48, more: true},
+			{offset: 96, payloadSize: 4, more: false},
+		},
+	},
+	{
+		description:           "No fragmentation with big header",
+		mtu:                   2000,
+		gso:                   nil,
+		transportHeaderLength: 100,
+		payloadSize:           1000,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 1100, more: false},
+		},
+	},
+	{
+		description:           "Fragmented with gso none",
+		mtu:                   1280,
+		gso:                   &stack.GSO{Type: stack.GSONone},
+		transportHeaderLength: 0,
+		payloadSize:           1400,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 1256, more: true},
+			{offset: 1256, payloadSize: 144, more: false},
+		},
+	},
+	{
+		description:           "Fragmented with big header",
+		mtu:                   1280,
+		gso:                   nil,
+		transportHeaderLength: 100,
+		payloadSize:           1200,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 1256, more: true},
+			{offset: 1256, payloadSize: 44, more: false},
+		},
+	},
+	{
+		description:           "Fragmented with MTU smaller than header",
+		mtu:                   300,
+		gso:                   nil,
+		transportHeaderLength: 1000,
+		payloadSize:           500,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 280, more: true},
+			{offset: 280, payloadSize: 280, more: true},
+			{offset: 560, payloadSize: 280, more: true},
+			{offset: 840, payloadSize: 280, more: true},
+			{offset: 1120, payloadSize: 280, more: true},
+			{offset: 1400, payloadSize: 100, more: false},
+		},
+	},
 }
 
-func TestFragmentation(t *testing.T) {
-	var manyPayloadViewsSizes [1000]int
-	for i := range manyPayloadViewsSizes {
-		manyPayloadViewsSizes[i] = 7
-	}
-	fragTests := []struct {
-		description       string
-		mtu               uint32
-		gso               *stack.GSO
-		hdrLength         int
-		extraLength       int
-		payloadViewsSizes []int
-		expectedFrags     int
-	}{
-		{"NoFragmentation", 2000, &stack.GSO{}, 0, header.IPv4MinimumSize, []int{1000}, 1},
-		{"NoFragmentationWithBigHeader", 2000, &stack.GSO{}, 16, header.IPv4MinimumSize, []int{1000}, 1},
-		{"Fragmented", 800, &stack.GSO{}, 0, header.IPv4MinimumSize, []int{1000}, 2},
-		{"FragmentedWithGsoNil", 800, nil, 0, header.IPv4MinimumSize, []int{1000}, 2},
-		{"FragmentedWithManyViews", 300, &stack.GSO{}, 0, header.IPv4MinimumSize, manyPayloadViewsSizes[:], 25},
-		{"FragmentedWithManyViewsAndPrependableBytes", 300, &stack.GSO{}, 0, header.IPv4MinimumSize + 55, manyPayloadViewsSizes[:], 25},
-		{"FragmentedWithBigHeader", 800, &stack.GSO{}, 20, header.IPv4MinimumSize, []int{1000}, 2},
-		{"FragmentedWithBigHeaderAndPrependableBytes", 800, &stack.GSO{}, 20, header.IPv4MinimumSize + 66, []int{1000}, 2},
-		{"FragmentedWithMTUSmallerThanHeaderAndPrependableBytes", 300, &stack.GSO{}, 1000, header.IPv4MinimumSize + 77, []int{500}, 6},
-	}
-
-	for _, ft := range fragTests {
+func TestFragmentationWritePacket(t *testing.T) {
+	const ttl = 42
+
+	for _, ft := range fragmentationTests {
 		t.Run(ft.description, func(t *testing.T) {
-			pkt := makeRandPkt(ft.hdrLength, ft.extraLength, ft.payloadViewsSizes)
+			ep := testutil.NewMockLinkEndpoint(ft.mtu, nil, math.MaxInt32)
+			r := buildRoute(t, ep)
+			pkt := testutil.MakeRandPkt(ft.transportHeaderLength, extraHeaderReserve+header.IPv4MinimumSize, []int{ft.payloadSize}, header.IPv4ProtocolNumber)
 			source := pkt.Clone()
-			c := buildContext(t, nil, ft.mtu)
-			err := c.Route.WritePacket(ft.gso, stack.NetworkHeaderParams{
+			err := r.WritePacket(ft.gso, stack.NetworkHeaderParams{
 				Protocol: tcp.ProtocolNumber,
-				TTL:      42,
+				TTL:      ttl,
 				TOS:      stack.DefaultTOS,
 			}, pkt)
 			if err != nil {
-				t.Errorf("err got %v, want %v", err, nil)
+				t.Fatalf("r.WritePacket(_, _, _) = %s", err)
 			}
-
-			var results []*stack.PacketBuffer
-		L:
-			for {
-				select {
-				case pi := <-c.linkEP.Ch:
-					results = append(results, pi)
-				default:
-					break L
-				}
+			if got := len(ep.WrittenPackets); got != len(ft.wantFragments) {
+				t.Errorf("got len(ep.WrittenPackets) = %d, want = %d", got, len(ft.wantFragments))
 			}
-
-			if got, want := len(results), ft.expectedFrags; got != want {
-				t.Errorf("len(result) got %d, want %d", got, want)
+			if got := int(r.Stats().IP.PacketsSent.Value()); got != len(ft.wantFragments) {
+				t.Errorf("got c.Route.Stats().IP.PacketsSent.Value() = %d, want = %d", got, len(ft.wantFragments))
+			}
+			if got := r.Stats().IP.OutgoingPacketErrors.Value(); got != 0 {
+				t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = 0", got)
 			}
-			if got, want := len(results), int(c.Route.Stats().IP.PacketsSent.Value()); got != want {
-				t.Errorf("no errors yet len(result) got %d, want %d", got, want)
+			if err := compareFragments(ep.WrittenPackets, source, ft.mtu, ft.wantFragments, tcp.ProtocolNumber); err != nil {
+				t.Error(err)
 			}
-			compareFragments(t, results, source, ft.mtu)
 		})
 	}
 }
 
-// TestFragmentationErrors checks that errors are returned from write packet
+func TestFragmentationWritePackets(t *testing.T) {
+	const ttl = 42
+	writePacketsTests := []struct {
+		description  string
+		insertBefore int
+		insertAfter  int
+	}{
+		{
+			description:  "Single packet",
+			insertBefore: 0,
+			insertAfter:  0,
+		},
+		{
+			description:  "With packet before",
+			insertBefore: 1,
+			insertAfter:  0,
+		},
+		{
+			description:  "With packet after",
+			insertBefore: 0,
+			insertAfter:  1,
+		},
+		{
+			description:  "With packet before and after",
+			insertBefore: 1,
+			insertAfter:  1,
+		},
+	}
+	tinyPacket := testutil.MakeRandPkt(header.TCPMinimumSize, extraHeaderReserve+header.IPv4MinimumSize, []int{1}, header.IPv4ProtocolNumber)
+
+	for _, test := range writePacketsTests {
+		t.Run(test.description, func(t *testing.T) {
+			for _, ft := range fragmentationTests {
+				t.Run(ft.description, func(t *testing.T) {
+					var pkts stack.PacketBufferList
+					for i := 0; i < test.insertBefore; i++ {
+						pkts.PushBack(tinyPacket.Clone())
+					}
+					pkt := testutil.MakeRandPkt(ft.transportHeaderLength, extraHeaderReserve+header.IPv4MinimumSize, []int{ft.payloadSize}, header.IPv4ProtocolNumber)
+					pkts.PushBack(pkt.Clone())
+					for i := 0; i < test.insertAfter; i++ {
+						pkts.PushBack(tinyPacket.Clone())
+					}
+
+					ep := testutil.NewMockLinkEndpoint(ft.mtu, nil, math.MaxInt32)
+					r := buildRoute(t, ep)
+
+					wantTotalPackets := len(ft.wantFragments) + test.insertBefore + test.insertAfter
+					n, err := r.WritePackets(ft.gso, pkts, stack.NetworkHeaderParams{
+						Protocol: tcp.ProtocolNumber,
+						TTL:      ttl,
+						TOS:      stack.DefaultTOS,
+					})
+					if err != nil {
+						t.Errorf("got WritePackets(_, _, _) = (_, %s), want = (_, nil)", err)
+					}
+					if n != wantTotalPackets {
+						t.Errorf("got WritePackets(_, _, _) = (%d, _), want = (%d, _)", n, wantTotalPackets)
+					}
+					if got := len(ep.WrittenPackets); got != wantTotalPackets {
+						t.Errorf("got len(ep.WrittenPackets) = %d, want = %d", got, wantTotalPackets)
+					}
+					if got := int(r.Stats().IP.PacketsSent.Value()); got != wantTotalPackets {
+						t.Errorf("got c.Route.Stats().IP.PacketsSent.Value() = %d, want = %d", got, wantTotalPackets)
+					}
+					if got := int(r.Stats().IP.OutgoingPacketErrors.Value()); got != 0 {
+						t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = 0", got)
+					}
+
+					if wantTotalPackets == 0 {
+						return
+					}
+
+					fragments := ep.WrittenPackets[test.insertBefore : len(ft.wantFragments)+test.insertBefore]
+					if err := compareFragments(fragments, pkt, ft.mtu, ft.wantFragments, tcp.ProtocolNumber); err != nil {
+						t.Error(err)
+					}
+				})
+			}
+		})
+	}
+}
+
+// TestFragmentationErrors checks that errors are returned from WritePacket
 // correctly.
 func TestFragmentationErrors(t *testing.T) {
-	fragTests := []struct {
+	const ttl = 42
+
+	tests := []struct {
 		description           string
 		mtu                   uint32
-		hdrLength             int
-		payloadViewsSizes     []int
-		packetCollectorErrors []*tcpip.Error
+		transportHeaderLength int
+		payloadSize           int
+		allowPackets          int
+		outgoingErrors        int
+		mockError             *tcpip.Error
+		wantError             *tcpip.Error
 	}{
-		{"NoFrag", 2000, 0, []int{1000}, []*tcpip.Error{tcpip.ErrAborted}},
-		{"ErrorOnFirstFrag", 500, 0, []int{1000}, []*tcpip.Error{tcpip.ErrAborted}},
-		{"ErrorOnSecondFrag", 500, 0, []int{1000}, []*tcpip.Error{nil, tcpip.ErrAborted}},
-		{"ErrorOnFirstFragMTUSmallerThanHdr", 500, 1000, []int{500}, []*tcpip.Error{tcpip.ErrAborted}},
+		{
+			description:           "No frag",
+			mtu:                   2000,
+			payloadSize:           1000,
+			transportHeaderLength: 0,
+			allowPackets:          0,
+			outgoingErrors:        1,
+			mockError:             tcpip.ErrAborted,
+			wantError:             tcpip.ErrAborted,
+		},
+		{
+			description:           "Error on first frag",
+			mtu:                   500,
+			payloadSize:           1000,
+			transportHeaderLength: 0,
+			allowPackets:          0,
+			outgoingErrors:        3,
+			mockError:             tcpip.ErrAborted,
+			wantError:             tcpip.ErrAborted,
+		},
+		{
+			description:           "Error on second frag",
+			mtu:                   500,
+			payloadSize:           1000,
+			transportHeaderLength: 0,
+			allowPackets:          1,
+			outgoingErrors:        2,
+			mockError:             tcpip.ErrAborted,
+			wantError:             tcpip.ErrAborted,
+		},
+		{
+			description:           "Error on first frag MTU smaller than header",
+			mtu:                   500,
+			transportHeaderLength: 1000,
+			payloadSize:           500,
+			allowPackets:          0,
+			outgoingErrors:        4,
+			mockError:             tcpip.ErrAborted,
+			wantError:             tcpip.ErrAborted,
+		},
+		{
+			description:           "Error when MTU is smaller than IPv4 minimum MTU",
+			mtu:                   header.IPv4MinimumMTU - 1,
+			transportHeaderLength: 0,
+			payloadSize:           500,
+			allowPackets:          0,
+			outgoingErrors:        1,
+			mockError:             nil,
+			wantError:             tcpip.ErrInvalidEndpointState,
+		},
 	}
 
-	for _, ft := range fragTests {
+	for _, ft := range tests {
 		t.Run(ft.description, func(t *testing.T) {
-			pkt := makeRandPkt(ft.hdrLength, header.IPv4MinimumSize, ft.payloadViewsSizes)
-			c := buildContext(t, ft.packetCollectorErrors, ft.mtu)
-			err := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{
+			pkt := testutil.MakeRandPkt(ft.transportHeaderLength, extraHeaderReserve+header.IPv4MinimumSize, []int{ft.payloadSize}, header.IPv4ProtocolNumber)
+			ep := testutil.NewMockLinkEndpoint(ft.mtu, ft.mockError, ft.allowPackets)
+			r := buildRoute(t, ep)
+			err := r.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{
 				Protocol: tcp.ProtocolNumber,
-				TTL:      42,
+				TTL:      ttl,
 				TOS:      stack.DefaultTOS,
 			}, pkt)
-			for i := 0; i < len(ft.packetCollectorErrors)-1; i++ {
-				if got, want := ft.packetCollectorErrors[i], (*tcpip.Error)(nil); got != want {
-					t.Errorf("ft.packetCollectorErrors[%d] got %v, want %v", i, got, want)
-				}
+			if err != ft.wantError {
+				t.Errorf("got WritePacket(_, _, _) = %s, want = %s", err, ft.wantError)
 			}
-			// We only need to check that last error because all the ones before are
-			// nil.
-			if got, want := err, ft.packetCollectorErrors[len(ft.packetCollectorErrors)-1]; got != want {
-				t.Errorf("err got %v, want %v", got, want)
+			if got := int(r.Stats().IP.PacketsSent.Value()); got != ft.allowPackets {
+				t.Errorf("got r.Stats().IP.PacketsSent.Value() = %d, want = %d", got, ft.allowPackets)
 			}
-			if got, want := c.linkEP.Drain(), int(c.Route.Stats().IP.PacketsSent.Value())+1; err != nil && got != want {
-				t.Errorf("after linkEP error len(result) got %d, want %d", got, want)
+			if got := int(r.Stats().IP.OutgoingPacketErrors.Value()); got != ft.outgoingErrors {
+				t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = %d", got, ft.outgoingErrors)
 			}
 		})
 	}
 }
 
 func TestInvalidFragments(t *testing.T) {
-	// These packets have both IHL and TotalLength set to 0.
-	testCases := []struct {
+	const (
+		nicID    = 1
+		linkAddr = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e")
+		addr1    = "\x0a\x00\x00\x01"
+		addr2    = "\x0a\x00\x00\x02"
+		tos      = 0
+		ident    = 1
+		ttl      = 48
+		protocol = 6
+	)
+
+	payloadGen := func(payloadLen int) []byte {
+		payload := make([]byte, payloadLen)
+		for i := 0; i < len(payload); i++ {
+			payload[i] = 0x30
+		}
+		return payload
+	}
+
+	type fragmentData struct {
+		ipv4fields   header.IPv4Fields
+		payload      []byte
+		autoChecksum bool // if true, the Checksum field will be overwritten.
+	}
+
+	tests := []struct {
 		name                   string
-		packets                [][]byte
+		fragments              []fragmentData
 		wantMalformedIPPackets uint64
 		wantMalformedFragments uint64
 	}{
 		{
-			"ihl_totallen_zero_valid_frag_offset",
-			[][]byte{
-				{0x40, 0x30, 0x00, 0x00, 0x6c, 0x74, 0x7d, 0x30, 0x30, 0x30, 0x30, 0x30, 0x39, 0x32, 0x39, 0x33, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
-			},
-			1,
-			0,
-		},
-		{
-			"ihl_totallen_zero_invalid_frag_offset",
-			[][]byte{
-				{0x40, 0x30, 0x00, 0x00, 0x6c, 0x74, 0x20, 0x00, 0x30, 0x30, 0x30, 0x30, 0x39, 0x32, 0x39, 0x33, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+			name: "IHL and TotalLength zero, FragmentOffset non-zero",
+			fragments: []fragmentData{
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            0,
+						TOS:            tos,
+						TotalLength:    0,
+						ID:             ident,
+						Flags:          header.IPv4FlagDontFragment | header.IPv4FlagMoreFragments,
+						FragmentOffset: 59776,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload:      payloadGen(12),
+					autoChecksum: true,
+				},
 			},
-			1,
-			0,
+			wantMalformedIPPackets: 1,
+			wantMalformedFragments: 0,
 		},
 		{
-			// Total Length of 37(20 bytes IP header + 17 bytes of
-			// payload)
-			// Frag Offset of 0x1ffe = 8190*8 = 65520
-			// Leading to the fragment end to be past 65535.
-			"ihl_totallen_valid_invalid_frag_offset_1",
-			[][]byte{
-				{0x45, 0x30, 0x00, 0x25, 0x6c, 0x74, 0x1f, 0xfe, 0x30, 0x30, 0x30, 0x30, 0x39, 0x32, 0x39, 0x33, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+			name: "IHL and TotalLength zero, FragmentOffset zero",
+			fragments: []fragmentData{
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            0,
+						TOS:            tos,
+						TotalLength:    0,
+						ID:             ident,
+						Flags:          header.IPv4FlagMoreFragments,
+						FragmentOffset: 0,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload:      payloadGen(12),
+					autoChecksum: true,
+				},
 			},
-			1,
-			1,
+			wantMalformedIPPackets: 1,
+			wantMalformedFragments: 0,
 		},
-		// The following 3 tests were found by running a fuzzer and were
-		// triggering a panic in the IPv4 reassembler code.
 		{
-			"ihl_less_than_ipv4_minimum_size_1",
-			[][]byte{
-				{0x42, 0x30, 0x0, 0x30, 0x30, 0x40, 0x0, 0xf3, 0x30, 0x1, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
-				{0x42, 0x30, 0x0, 0x8, 0x30, 0x40, 0x20, 0x0, 0x30, 0x1, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+			// Payload 17 octets and Fragment offset 65520
+			// Leading to the fragment end to be past 65536.
+			name: "fragment ends past 65536",
+			fragments: []fragmentData{
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize + 17,
+						ID:             ident,
+						Flags:          0,
+						FragmentOffset: 65520,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload:      payloadGen(17),
+					autoChecksum: true,
+				},
 			},
-			2,
-			0,
+			wantMalformedIPPackets: 1,
+			wantMalformedFragments: 1,
 		},
 		{
-			"ihl_less_than_ipv4_minimum_size_2",
-			[][]byte{
-				{0x42, 0x30, 0x0, 0x30, 0x30, 0x40, 0xb3, 0x12, 0x30, 0x6, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
-				{0x42, 0x30, 0x0, 0x8, 0x30, 0x40, 0x20, 0x0, 0x30, 0x6, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+			// Payload 16 octets and fragment offset 65520
+			// Leading to the fragment end to be exactly 65536.
+			name: "fragment ends exactly at 65536",
+			fragments: []fragmentData{
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize + 16,
+						ID:             ident,
+						Flags:          0,
+						FragmentOffset: 65520,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload:      payloadGen(16),
+					autoChecksum: true,
+				},
 			},
-			2,
-			0,
+			wantMalformedIPPackets: 0,
+			wantMalformedFragments: 0,
 		},
 		{
-			"ihl_less_than_ipv4_minimum_size_3",
-			[][]byte{
-				{0x42, 0x30, 0x0, 0x30, 0x30, 0x40, 0xb3, 0x30, 0x30, 0x6, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
-				{0x42, 0x30, 0x0, 0x8, 0x30, 0x40, 0x20, 0x0, 0x30, 0x6, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+			name: "IHL less than IPv4 minimum size",
+			fragments: []fragmentData{
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize - 12,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize + 28,
+						ID:             ident,
+						Flags:          0,
+						FragmentOffset: 1944,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload:      payloadGen(28),
+					autoChecksum: true,
+				},
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize - 12,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize - 12,
+						ID:             ident,
+						Flags:          header.IPv4FlagMoreFragments,
+						FragmentOffset: 0,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload:      payloadGen(28),
+					autoChecksum: true,
+				},
 			},
-			2,
-			0,
+			wantMalformedIPPackets: 2,
+			wantMalformedFragments: 0,
 		},
 		{
-			"fragment_with_short_total_len_extra_payload",
-			[][]byte{
-				{0x46, 0x30, 0x00, 0x30, 0x30, 0x40, 0x0e, 0x12, 0x30, 0x06, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
-				{0x46, 0x30, 0x00, 0x18, 0x30, 0x40, 0x20, 0x00, 0x30, 0x06, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+			name: "fragment with short TotalLength and extra payload",
+			fragments: []fragmentData{
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize + 4,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize + 28,
+						ID:             ident,
+						Flags:          0,
+						FragmentOffset: 28816,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload:      payloadGen(28),
+					autoChecksum: true,
+				},
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize + 4,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize + 4,
+						ID:             ident,
+						Flags:          header.IPv4FlagMoreFragments,
+						FragmentOffset: 0,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload:      payloadGen(28),
+					autoChecksum: true,
+				},
 			},
-			1,
-			1,
+			wantMalformedIPPackets: 1,
+			wantMalformedFragments: 1,
 		},
 		{
-			"multiple_fragments_with_more_fragments_set_to_false",
-			[][]byte{
-				{0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x00, 0x10, 0x00, 0x06, 0x34, 0x69, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-				{0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x00, 0x01, 0x61, 0x06, 0x34, 0x69, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-				{0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x20, 0x00, 0x00, 0x06, 0x34, 0x1e, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+			name: "multiple fragments with More Fragments flag set to false",
+			fragments: []fragmentData{
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize + 8,
+						ID:             ident,
+						Flags:          0,
+						FragmentOffset: 128,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload:      payloadGen(8),
+					autoChecksum: true,
+				},
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize + 8,
+						ID:             ident,
+						Flags:          0,
+						FragmentOffset: 8,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload:      payloadGen(8),
+					autoChecksum: true,
+				},
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize + 8,
+						ID:             ident,
+						Flags:          header.IPv4FlagMoreFragments,
+						FragmentOffset: 0,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload:      payloadGen(8),
+					autoChecksum: true,
+				},
 			},
-			1,
-			1,
+			wantMalformedIPPackets: 1,
+			wantMalformedFragments: 1,
 		},
 	}
 
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			const nicID tcpip.NICID = 42
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{
-					ipv4.NewProtocol(),
+				NetworkProtocols: []stack.NetworkProtocolFactory{
+					ipv4.NewProtocol,
 				},
 			})
+			e := channel.New(0, 1500, linkAddr)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ipv4.ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, addr2, err)
+			}
+
+			for _, f := range test.fragments {
+				pktSize := header.IPv4MinimumSize + len(f.payload)
+				hdr := buffer.NewPrependable(pktSize)
 
-			var linkAddr = tcpip.LinkAddress([]byte{0x30, 0x30, 0x30, 0x30, 0x30, 0x30})
-			var remoteLinkAddr = tcpip.LinkAddress([]byte{0x30, 0x30, 0x30, 0x30, 0x30, 0x31})
-			ep := channel.New(10, 1500, linkAddr)
-			s.CreateNIC(nicID, sniffer.New(ep))
+				ip := header.IPv4(hdr.Prepend(pktSize))
+				ip.Encode(&f.ipv4fields)
+				copy(ip[header.IPv4MinimumSize:], f.payload)
+
+				if f.autoChecksum {
+					ip.SetChecksum(0)
+					ip.SetChecksum(^ip.CalculateChecksum())
+				}
 
-			for _, pkt := range tc.packets {
-				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, stack.NewPacketBuffer(stack.PacketBufferOptions{
-					Data: buffer.NewVectorisedView(len(pkt), []buffer.View{pkt}),
+				vv := hdr.View().ToVectorisedView()
+				e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+					Data: vv,
 				}))
 			}
 
-			if got, want := s.Stats().IP.MalformedPacketsReceived.Value(), tc.wantMalformedIPPackets; got != want {
+			if got, want := s.Stats().IP.MalformedPacketsReceived.Value(), test.wantMalformedIPPackets; got != want {
 				t.Errorf("incorrect Stats.IP.MalformedPacketsReceived, got: %d, want: %d", got, want)
 			}
-			if got, want := s.Stats().IP.MalformedFragmentsReceived.Value(), tc.wantMalformedFragments; got != want {
+			if got, want := s.Stats().IP.MalformedFragmentsReceived.Value(), test.wantMalformedFragments; got != want {
 				t.Errorf("incorrect Stats.IP.MalformedFragmentsReceived, got: %d, want: %d", got, want)
 			}
 		})
 	}
 }
 
+func TestFragmentReassemblyTimeout(t *testing.T) {
+	const (
+		nicID    = 1
+		linkAddr = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e")
+		addr1    = "\x0a\x00\x00\x01"
+		addr2    = "\x0a\x00\x00\x02"
+		tos      = 0
+		ident    = 1
+		ttl      = 48
+		protocol = 99
+		data     = "TEST_FRAGMENT_REASSEMBLY_TIMEOUT"
+	)
+
+	type fragmentData struct {
+		ipv4fields header.IPv4Fields
+		payload    []byte
+	}
+
+	tests := []struct {
+		name       string
+		fragments  []fragmentData
+		expectICMP bool
+	}{
+		{
+			name: "first fragment only",
+			fragments: []fragmentData{
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize + 16,
+						ID:             ident,
+						Flags:          header.IPv4FlagMoreFragments,
+						FragmentOffset: 0,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload: []byte(data)[:16],
+				},
+			},
+			expectICMP: true,
+		},
+		{
+			name: "two first fragments",
+			fragments: []fragmentData{
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize + 16,
+						ID:             ident,
+						Flags:          header.IPv4FlagMoreFragments,
+						FragmentOffset: 0,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload: []byte(data)[:16],
+				},
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize + 16,
+						ID:             ident,
+						Flags:          header.IPv4FlagMoreFragments,
+						FragmentOffset: 0,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload: []byte(data)[:16],
+				},
+			},
+			expectICMP: true,
+		},
+		{
+			name: "second fragment only",
+			fragments: []fragmentData{
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize,
+						TOS:            tos,
+						TotalLength:    uint16(header.IPv4MinimumSize + len(data) - 16),
+						ID:             ident,
+						Flags:          0,
+						FragmentOffset: 8,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload: []byte(data)[16:],
+				},
+			},
+			expectICMP: false,
+		},
+		{
+			name: "two fragments with a gap",
+			fragments: []fragmentData{
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize + 8,
+						ID:             ident,
+						Flags:          header.IPv4FlagMoreFragments,
+						FragmentOffset: 0,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload: []byte(data)[:8],
+				},
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize,
+						TOS:            tos,
+						TotalLength:    uint16(header.IPv4MinimumSize + len(data) - 16),
+						ID:             ident,
+						Flags:          0,
+						FragmentOffset: 16,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload: []byte(data)[16:],
+				},
+			},
+			expectICMP: true,
+		},
+		{
+			name: "two fragments with a gap in reverse order",
+			fragments: []fragmentData{
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize,
+						TOS:            tos,
+						TotalLength:    uint16(header.IPv4MinimumSize + len(data) - 16),
+						ID:             ident,
+						Flags:          0,
+						FragmentOffset: 16,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload: []byte(data)[16:],
+				},
+				{
+					ipv4fields: header.IPv4Fields{
+						IHL:            header.IPv4MinimumSize,
+						TOS:            tos,
+						TotalLength:    header.IPv4MinimumSize + 8,
+						ID:             ident,
+						Flags:          header.IPv4FlagMoreFragments,
+						FragmentOffset: 0,
+						TTL:            ttl,
+						Protocol:       protocol,
+						SrcAddr:        addr1,
+						DstAddr:        addr2,
+					},
+					payload: []byte(data)[:8],
+				},
+			},
+			expectICMP: true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			clock := faketime.NewManualClock()
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocolFactory{
+					ipv4.NewProtocol,
+				},
+				Clock: clock,
+			})
+			e := channel.New(1, 1500, linkAddr)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ipv4.ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, addr2, err)
+			}
+			s.SetRouteTable([]tcpip.Route{{
+				Destination: header.IPv4EmptySubnet,
+				NIC:         nicID,
+			}})
+
+			var firstFragmentSent buffer.View
+			for _, f := range test.fragments {
+				pktSize := header.IPv4MinimumSize
+				hdr := buffer.NewPrependable(pktSize)
+
+				ip := header.IPv4(hdr.Prepend(pktSize))
+				ip.Encode(&f.ipv4fields)
+
+				ip.SetChecksum(0)
+				ip.SetChecksum(^ip.CalculateChecksum())
+
+				vv := hdr.View().ToVectorisedView()
+				vv.AppendView(f.payload)
+
+				pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+					Data: vv,
+				})
+
+				if firstFragmentSent == nil && ip.FragmentOffset() == 0 {
+					firstFragmentSent = stack.PayloadSince(pkt.NetworkHeader())
+				}
+
+				e.InjectInbound(header.IPv4ProtocolNumber, pkt)
+			}
+
+			clock.Advance(ipv4.ReassembleTimeout)
+
+			reply, ok := e.Read()
+			if !test.expectICMP {
+				if ok {
+					t.Fatalf("unexpected ICMP error message received: %#v", reply)
+				}
+				return
+			}
+			if !ok {
+				t.Fatal("expected ICMP error message missing")
+			}
+			if firstFragmentSent == nil {
+				t.Fatalf("unexpected ICMP error message received: %#v", reply)
+			}
+
+			checker.IPv4(t, stack.PayloadSince(reply.Pkt.NetworkHeader()),
+				checker.SrcAddr(addr2),
+				checker.DstAddr(addr1),
+				checker.IPFullLength(uint16(header.IPv4MinimumSize+header.ICMPv4MinimumSize+firstFragmentSent.Size())),
+				checker.IPv4HeaderLength(header.IPv4MinimumSize),
+				checker.ICMPv4(
+					checker.ICMPv4Type(header.ICMPv4TimeExceeded),
+					checker.ICMPv4Code(header.ICMPv4ReassemblyTimeout),
+					checker.ICMPv4Checksum(),
+					checker.ICMPv4Payload([]byte(firstFragmentSent)),
+				),
+			)
+		})
+	}
+}
+
 // TestReceiveFragments feeds fragments in through the incoming packet path to
 // test reassembly
 func TestReceiveFragments(t *testing.T) {
@@ -534,6 +1370,9 @@ func TestReceiveFragments(t *testing.T) {
 	// the fragment block size of 8 (RFC 791 section 3.1 page 14).
 	ipv4Payload3Addr1ToAddr2 := udpGen(127, 3, addr1, addr2)
 	udpPayload3Addr1ToAddr2 := ipv4Payload3Addr1ToAddr2[header.UDPMinimumSize:]
+	// Used to test the max reassembled payload length (65,535 octets).
+	ipv4Payload4Addr1ToAddr2 := udpGen(header.UDPMaximumSize-header.UDPMinimumSize, 4, addr1, addr2)
+	udpPayload4Addr1ToAddr2 := ipv4Payload4Addr1ToAddr2[header.UDPMinimumSize:]
 
 	type fragmentData struct {
 		srcAddr        tcpip.Address
@@ -827,14 +1666,36 @@ func TestReceiveFragments(t *testing.T) {
 			},
 			expectedPayloads: nil,
 		},
+		{
+			name: "Two fragments reassembled into a maximum UDP packet",
+			fragments: []fragmentData{
+				{
+					srcAddr:        addr1,
+					dstAddr:        addr2,
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload4Addr1ToAddr2[:65512],
+				},
+				{
+					srcAddr:        addr1,
+					dstAddr:        addr2,
+					id:             1,
+					flags:          0,
+					fragmentOffset: 65512,
+					payload:        ipv4Payload4Addr1ToAddr2[65512:],
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload4Addr1ToAddr2},
+		},
 	}
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			// Setup a stack and endpoint.
 			s := stack.New(stack.Options{
-				NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
-				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
 			})
 			e := channel.New(0, 1280, tcpip.LinkAddress("\xf0\x00"))
 			if err := s.CreateNIC(nicID, e); err != nil {
@@ -877,6 +1738,7 @@ func TestReceiveFragments(t *testing.T) {
 					SrcAddr:        frag.srcAddr,
 					DstAddr:        frag.dstAddr,
 				})
+				ip.SetChecksum(^ip.CalculateChecksum())
 
 				vv := hdr.View().ToVectorisedView()
 				vv.AppendView(frag.payload)
@@ -906,3 +1768,394 @@ func TestReceiveFragments(t *testing.T) {
 		})
 	}
 }
+
+func TestWriteStats(t *testing.T) {
+	const nPackets = 3
+
+	tests := []struct {
+		name          string
+		setup         func(*testing.T, *stack.Stack)
+		allowPackets  int
+		expectSent    int
+		expectDropped int
+		expectWritten int
+	}{
+		{
+			name: "Accept all",
+			// No setup needed, tables accept everything by default.
+			setup:         func(*testing.T, *stack.Stack) {},
+			allowPackets:  math.MaxInt32,
+			expectSent:    nPackets,
+			expectDropped: 0,
+			expectWritten: nPackets,
+		}, {
+			name: "Accept all with error",
+			// No setup needed, tables accept everything by default.
+			setup:         func(*testing.T, *stack.Stack) {},
+			allowPackets:  nPackets - 1,
+			expectSent:    nPackets - 1,
+			expectDropped: 0,
+			expectWritten: nPackets - 1,
+		}, {
+			name: "Drop all",
+			setup: func(t *testing.T, stk *stack.Stack) {
+				// Install Output DROP rule.
+				t.Helper()
+				ipt := stk.IPTables()
+				filter, ok := ipt.GetTable(stack.FilterTable, false /* ipv6 */)
+				if !ok {
+					t.Fatalf("failed to find filter table")
+				}
+				ruleIdx := filter.BuiltinChains[stack.Output]
+				filter.Rules[ruleIdx].Target = &stack.DropTarget{}
+				if err := ipt.ReplaceTable(stack.FilterTable, filter, false /* ipv6 */); err != nil {
+					t.Fatalf("failed to replace table: %s", err)
+				}
+			},
+			allowPackets:  math.MaxInt32,
+			expectSent:    0,
+			expectDropped: nPackets,
+			expectWritten: nPackets,
+		}, {
+			name: "Drop some",
+			setup: func(t *testing.T, stk *stack.Stack) {
+				// Install Output DROP rule that matches only 1
+				// of the 3 packets.
+				t.Helper()
+				ipt := stk.IPTables()
+				filter, ok := ipt.GetTable(stack.FilterTable, false /* ipv6 */)
+				if !ok {
+					t.Fatalf("failed to find filter table")
+				}
+				// We'll match and DROP the last packet.
+				ruleIdx := filter.BuiltinChains[stack.Output]
+				filter.Rules[ruleIdx].Target = &stack.DropTarget{}
+				filter.Rules[ruleIdx].Matchers = []stack.Matcher{&limitedMatcher{nPackets - 1}}
+				// Make sure the next rule is ACCEPT.
+				filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{}
+				if err := ipt.ReplaceTable(stack.FilterTable, filter, false /* ipv6 */); err != nil {
+					t.Fatalf("failed to replace table: %s", err)
+				}
+			},
+			allowPackets:  math.MaxInt32,
+			expectSent:    nPackets - 1,
+			expectDropped: 1,
+			expectWritten: nPackets,
+		},
+	}
+
+	// Parameterize the tests to run with both WritePacket and WritePackets.
+	writers := []struct {
+		name         string
+		writePackets func(*stack.Route, stack.PacketBufferList) (int, *tcpip.Error)
+	}{
+		{
+			name: "WritePacket",
+			writePackets: func(rt *stack.Route, pkts stack.PacketBufferList) (int, *tcpip.Error) {
+				nWritten := 0
+				for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+					if err := rt.WritePacket(nil, stack.NetworkHeaderParams{}, pkt); err != nil {
+						return nWritten, err
+					}
+					nWritten++
+				}
+				return nWritten, nil
+			},
+		}, {
+			name: "WritePackets",
+			writePackets: func(rt *stack.Route, pkts stack.PacketBufferList) (int, *tcpip.Error) {
+				return rt.WritePackets(nil, pkts, stack.NetworkHeaderParams{})
+			},
+		},
+	}
+
+	for _, writer := range writers {
+		t.Run(writer.name, func(t *testing.T) {
+			for _, test := range tests {
+				t.Run(test.name, func(t *testing.T) {
+					ep := testutil.NewMockLinkEndpoint(header.IPv4MinimumMTU, tcpip.ErrInvalidEndpointState, test.allowPackets)
+					rt := buildRoute(t, ep)
+
+					var pkts stack.PacketBufferList
+					for i := 0; i < nPackets; i++ {
+						pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+							ReserveHeaderBytes: header.UDPMinimumSize + int(rt.MaxHeaderLength()),
+							Data:               buffer.NewView(0).ToVectorisedView(),
+						})
+						pkt.TransportHeader().Push(header.UDPMinimumSize)
+						pkts.PushBack(pkt)
+					}
+
+					test.setup(t, rt.Stack())
+
+					nWritten, _ := writer.writePackets(&rt, pkts)
+
+					if got := int(rt.Stats().IP.PacketsSent.Value()); got != test.expectSent {
+						t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent)
+					}
+					if got := int(rt.Stats().IP.IPTablesOutputDropped.Value()); got != test.expectDropped {
+						t.Errorf("dropped %d packets, but expected to drop %d", got, test.expectDropped)
+					}
+					if nWritten != test.expectWritten {
+						t.Errorf("wrote %d packets, but expected WritePackets to return %d", nWritten, test.expectWritten)
+					}
+				})
+			}
+		})
+	}
+}
+
+func buildRoute(t *testing.T, ep stack.LinkEndpoint) stack.Route {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol},
+	})
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatalf("CreateNIC(1, _) failed: %s", err)
+	}
+	const (
+		src = "\x10\x00\x00\x01"
+		dst = "\x10\x00\x00\x02"
+	)
+	if err := s.AddAddress(1, ipv4.ProtocolNumber, src); err != nil {
+		t.Fatalf("AddAddress(1, %d, %s) failed: %s", ipv4.ProtocolNumber, src, err)
+	}
+	{
+		mask := tcpip.AddressMask(header.IPv4Broadcast)
+		subnet, err := tcpip.NewSubnet(dst, mask)
+		if err != nil {
+			t.Fatalf("NewSubnet(%s, %s) failed: %v", dst, mask, err)
+		}
+		s.SetRouteTable([]tcpip.Route{{
+			Destination: subnet,
+			NIC:         1,
+		}})
+	}
+	rt, err := s.FindRoute(1, src, dst, ipv4.ProtocolNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatalf("FindRoute(1, %s, %s, %d, false) = %s", src, dst, ipv4.ProtocolNumber, err)
+	}
+	return rt
+}
+
+// limitedMatcher is an iptables matcher that matches after a certain number of
+// packets are checked against it.
+type limitedMatcher struct {
+	limit int
+}
+
+// Name implements Matcher.Name.
+func (*limitedMatcher) Name() string {
+	return "limitedMatcher"
+}
+
+// Match implements Matcher.Match.
+func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string) (bool, bool) {
+	if lm.limit == 0 {
+		return true, false
+	}
+	lm.limit--
+	return false, false
+}
+
+func TestPacketQueing(t *testing.T) {
+	const nicID = 1
+
+	var (
+		host1NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x06")
+		host2NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x09")
+
+		host1IPv4Addr = tcpip.ProtocolAddress{
+			Protocol: ipv4.ProtocolNumber,
+			AddressWithPrefix: tcpip.AddressWithPrefix{
+				Address:   tcpip.Address(net.ParseIP("192.168.0.1").To4()),
+				PrefixLen: 24,
+			},
+		}
+		host2IPv4Addr = tcpip.ProtocolAddress{
+			Protocol: ipv4.ProtocolNumber,
+			AddressWithPrefix: tcpip.AddressWithPrefix{
+				Address:   tcpip.Address(net.ParseIP("192.168.0.2").To4()),
+				PrefixLen: 8,
+			},
+		}
+	)
+
+	tests := []struct {
+		name      string
+		rxPkt     func(*channel.Endpoint)
+		checkResp func(*testing.T, *channel.Endpoint)
+	}{
+		{
+			name: "ICMP Error",
+			rxPkt: func(e *channel.Endpoint) {
+				hdr := buffer.NewPrependable(header.IPv4MinimumSize + header.UDPMinimumSize)
+				u := header.UDP(hdr.Prepend(header.UDPMinimumSize))
+				u.Encode(&header.UDPFields{
+					SrcPort: 5555,
+					DstPort: 80,
+					Length:  header.UDPMinimumSize,
+				})
+				sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, host2IPv4Addr.AddressWithPrefix.Address, host1IPv4Addr.AddressWithPrefix.Address, header.UDPMinimumSize)
+				sum = header.Checksum(header.UDP([]byte{}), sum)
+				u.SetChecksum(^u.CalculateChecksum(sum))
+				ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
+				ip.Encode(&header.IPv4Fields{
+					IHL:         header.IPv4MinimumSize,
+					TotalLength: header.IPv4MinimumSize + header.UDPMinimumSize,
+					TTL:         ipv4.DefaultTTL,
+					Protocol:    uint8(udp.ProtocolNumber),
+					SrcAddr:     host2IPv4Addr.AddressWithPrefix.Address,
+					DstAddr:     host1IPv4Addr.AddressWithPrefix.Address,
+				})
+				ip.SetChecksum(^ip.CalculateChecksum())
+				e.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+					Data: hdr.View().ToVectorisedView(),
+				}))
+			},
+			checkResp: func(t *testing.T, e *channel.Endpoint) {
+				p, ok := e.ReadContext(context.Background())
+				if !ok {
+					t.Fatalf("timed out waiting for packet")
+				}
+				if p.Proto != header.IPv4ProtocolNumber {
+					t.Errorf("got p.Proto = %d, want = %d", p.Proto, header.IPv4ProtocolNumber)
+				}
+				if p.Route.RemoteLinkAddress != host2NICLinkAddr {
+					t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr)
+				}
+				checker.IPv4(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
+					checker.SrcAddr(host1IPv4Addr.AddressWithPrefix.Address),
+					checker.DstAddr(host2IPv4Addr.AddressWithPrefix.Address),
+					checker.ICMPv4(
+						checker.ICMPv4Type(header.ICMPv4DstUnreachable),
+						checker.ICMPv4Code(header.ICMPv4PortUnreachable)))
+			},
+		},
+
+		{
+			name: "Ping",
+			rxPkt: func(e *channel.Endpoint) {
+				totalLen := header.IPv4MinimumSize + header.ICMPv4MinimumSize
+				hdr := buffer.NewPrependable(totalLen)
+				pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize))
+				pkt.SetType(header.ICMPv4Echo)
+				pkt.SetCode(0)
+				pkt.SetChecksum(0)
+				pkt.SetChecksum(^header.Checksum(pkt, 0))
+				ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
+				ip.Encode(&header.IPv4Fields{
+					IHL:         header.IPv4MinimumSize,
+					TotalLength: uint16(totalLen),
+					Protocol:    uint8(icmp.ProtocolNumber4),
+					TTL:         ipv4.DefaultTTL,
+					SrcAddr:     host2IPv4Addr.AddressWithPrefix.Address,
+					DstAddr:     host1IPv4Addr.AddressWithPrefix.Address,
+				})
+				ip.SetChecksum(^ip.CalculateChecksum())
+				e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+					Data: hdr.View().ToVectorisedView(),
+				}))
+			},
+			checkResp: func(t *testing.T, e *channel.Endpoint) {
+				p, ok := e.ReadContext(context.Background())
+				if !ok {
+					t.Fatalf("timed out waiting for packet")
+				}
+				if p.Proto != header.IPv4ProtocolNumber {
+					t.Errorf("got p.Proto = %d, want = %d", p.Proto, header.IPv4ProtocolNumber)
+				}
+				if p.Route.RemoteLinkAddress != host2NICLinkAddr {
+					t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr)
+				}
+				checker.IPv4(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
+					checker.SrcAddr(host1IPv4Addr.AddressWithPrefix.Address),
+					checker.DstAddr(host2IPv4Addr.AddressWithPrefix.Address),
+					checker.ICMPv4(
+						checker.ICMPv4Type(header.ICMPv4EchoReply),
+						checker.ICMPv4Code(header.ICMPv4UnusedCode)))
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e := channel.New(1, defaultMTU, host1NICLinkAddr)
+			e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocolFactory{arp.NewProtocol, ipv4.NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
+			})
+
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+				t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, arp.ProtocolNumber, arp.ProtocolAddress, err)
+			}
+			if err := s.AddProtocolAddress(nicID, host1IPv4Addr); err != nil {
+				t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", nicID, host1IPv4Addr, err)
+			}
+
+			s.SetRouteTable([]tcpip.Route{
+				{
+					Destination: host1IPv4Addr.AddressWithPrefix.Subnet(),
+					NIC:         nicID,
+				},
+			})
+
+			// Receive a packet to trigger link resolution before a response is sent.
+			test.rxPkt(e)
+
+			// Wait for a ARP request since link address resolution should be
+			// performed.
+			{
+				p, ok := e.ReadContext(context.Background())
+				if !ok {
+					t.Fatalf("timed out waiting for packet")
+				}
+				if p.Proto != arp.ProtocolNumber {
+					t.Errorf("got p.Proto = %d, want = %d", p.Proto, arp.ProtocolNumber)
+				}
+				if p.Route.RemoteLinkAddress != header.EthernetBroadcastAddress {
+					t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, header.EthernetBroadcastAddress)
+				}
+				rep := header.ARP(p.Pkt.NetworkHeader().View())
+				if got := rep.Op(); got != header.ARPRequest {
+					t.Errorf("got Op() = %d, want = %d", got, header.ARPRequest)
+				}
+				if got := tcpip.LinkAddress(rep.HardwareAddressSender()); got != host1NICLinkAddr {
+					t.Errorf("got HardwareAddressSender = %s, want = %s", got, host1NICLinkAddr)
+				}
+				if got := tcpip.Address(rep.ProtocolAddressSender()); got != host1IPv4Addr.AddressWithPrefix.Address {
+					t.Errorf("got ProtocolAddressSender = %s, want = %s", got, host1IPv4Addr.AddressWithPrefix.Address)
+				}
+				if got := tcpip.Address(rep.ProtocolAddressTarget()); got != host2IPv4Addr.AddressWithPrefix.Address {
+					t.Errorf("got ProtocolAddressTarget = %s, want = %s", got, host2IPv4Addr.AddressWithPrefix.Address)
+				}
+			}
+
+			// Send an ARP reply to complete link address resolution.
+			{
+				hdr := buffer.View(make([]byte, header.ARPSize))
+				packet := header.ARP(hdr)
+				packet.SetIPv4OverEthernet()
+				packet.SetOp(header.ARPReply)
+				copy(packet.HardwareAddressSender(), host2NICLinkAddr)
+				copy(packet.ProtocolAddressSender(), host2IPv4Addr.AddressWithPrefix.Address)
+				copy(packet.HardwareAddressTarget(), host1NICLinkAddr)
+				copy(packet.ProtocolAddressTarget(), host1IPv4Addr.AddressWithPrefix.Address)
+				e.InjectInbound(arp.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+					Data: hdr.ToVectorisedView(),
+				}))
+			}
+
+			// Expect the response now that the link address has resolved.
+			test.checkResp(t, e)
+
+			// Since link resolution was already performed, it shouldn't be performed
+			// again.
+			test.rxPkt(e)
+			test.checkResp(t, e)
+		})
+	}
+}
diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD
index bcc64994e..0ac24a6fb 100644
--- a/pkg/tcpip/network/ipv6/BUILD
+++ b/pkg/tcpip/network/ipv6/BUILD
@@ -5,15 +5,20 @@ package(licenses = ["notice"])
 go_library(
     name = "ipv6",
     srcs = [
+        "dhcpv6configurationfromndpra_string.go",
         "icmp.go",
         "ipv6.go",
+        "ndp.go",
     ],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/header/parse",
         "//pkg/tcpip/network/fragmentation",
+        "//pkg/tcpip/network/hash",
         "//pkg/tcpip/stack",
     ],
 )
@@ -31,11 +36,14 @@ go_test(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/checker",
+        "//pkg/tcpip/faketime",
         "//pkg/tcpip/header",
         "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/sniffer",
+        "//pkg/tcpip/network/testutil",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/icmp",
+        "//pkg/tcpip/transport/tcp",
         "//pkg/tcpip/transport/udp",
         "//pkg/waiter",
         "@com_github_google_go_cmp//cmp:go_default_library",
diff --git a/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go b/pkg/tcpip/network/ipv6/dhcpv6configurationfromndpra_string.go
index d199ded6a..09ba133b1 100644
--- a/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go
+++ b/pkg/tcpip/network/ipv6/dhcpv6configurationfromndpra_string.go
@@ -14,7 +14,7 @@
 
 // Code generated by "stringer -type DHCPv6ConfigurationFromNDPRA"; DO NOT EDIT.
 
-package stack
+package ipv6
 
 import "strconv"
 
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 66d3a953a..3c15e41a7 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -41,7 +41,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack
 	// Drop packet if it doesn't have the basic IPv6 header or if the
 	// original source address doesn't match an address we own.
 	src := hdr.SourceAddress()
-	if e.stack.CheckLocalAddress(e.NICID(), ProtocolNumber, src) == 0 {
+	if e.protocol.stack.CheckLocalAddress(e.nic.ID(), ProtocolNumber, src) == 0 {
 		return
 	}
 
@@ -71,6 +71,59 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack
 	e.dispatcher.DeliverTransportControlPacket(src, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
+// getLinkAddrOption searches NDP options for a given link address option using
+// the provided getAddr function as a filter. Returns the link address if
+// found; otherwise, returns the zero link address value. Also returns true if
+// the options are valid as per the wire format, false otherwise.
+func getLinkAddrOption(it header.NDPOptionIterator, getAddr func(header.NDPOption) tcpip.LinkAddress) (tcpip.LinkAddress, bool) {
+	var linkAddr tcpip.LinkAddress
+	for {
+		opt, done, err := it.Next()
+		if err != nil {
+			return "", false
+		}
+		if done {
+			break
+		}
+		if addr := getAddr(opt); len(addr) != 0 {
+			// No RFCs define what to do when an NDP message has multiple Link-Layer
+			// Address options. Since no interface can have multiple link-layer
+			// addresses, we consider such messages invalid.
+			if len(linkAddr) != 0 {
+				return "", false
+			}
+			linkAddr = addr
+		}
+	}
+	return linkAddr, true
+}
+
+// getSourceLinkAddr searches NDP options for the source link address option.
+// Returns the link address if found; otherwise, returns the zero link address
+// value. Also returns true if the options are valid as per the wire format,
+// false otherwise.
+func getSourceLinkAddr(it header.NDPOptionIterator) (tcpip.LinkAddress, bool) {
+	return getLinkAddrOption(it, func(opt header.NDPOption) tcpip.LinkAddress {
+		if src, ok := opt.(header.NDPSourceLinkLayerAddressOption); ok {
+			return src.EthernetAddress()
+		}
+		return ""
+	})
+}
+
+// getTargetLinkAddr searches NDP options for the target link address option.
+// Returns the link address if found; otherwise, returns the zero link address
+// value. Also returns true if the options are valid as per the wire format,
+// false otherwise.
+func getTargetLinkAddr(it header.NDPOptionIterator) (tcpip.LinkAddress, bool) {
+	return getLinkAddrOption(it, func(opt header.NDPOption) tcpip.LinkAddress {
+		if dst, ok := opt.(header.NDPTargetLinkLayerAddressOption); ok {
+			return dst.EthernetAddress()
+		}
+		return ""
+	})
+}
+
 func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragmentHeader bool) {
 	stats := r.Stats().ICMP
 	sent := stats.V6PacketsSent
@@ -117,8 +170,11 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 			return
 		}
 		pkt.Data.TrimFront(header.ICMPv6PacketTooBigMinimumSize)
-		mtu := header.ICMPv6(hdr).MTU()
-		e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), pkt)
+		networkMTU, err := calculateNetworkMTU(header.ICMPv6(hdr).MTU(), header.IPv6MinimumSize)
+		if err != nil {
+			networkMTU = 0
+		}
+		e.handleControl(stack.ControlPacketTooBig, networkMTU, pkt)
 
 	case header.ICMPv6DstUnreachable:
 		received.DstUnreachable.Increment()
@@ -137,7 +193,7 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 
 	case header.ICMPv6NeighborSolicit:
 		received.NeighborSolicit.Increment()
-		if pkt.Data.Size() < header.ICMPv6NeighborSolicitMinimumSize || !isNDPValid() {
+		if !isNDPValid() || pkt.Data.Size() < header.ICMPv6NeighborSolicitMinimumSize {
 			received.Invalid.Increment()
 			return
 		}
@@ -147,22 +203,16 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 		// NDP messages cannot be fragmented. Also note that in the common case NDP
 		// datagrams are very small and ToView() will not incur allocations.
 		ns := header.NDPNeighborSolicit(payload.ToView())
-		it, err := ns.Options().Iter(true)
-		if err != nil {
-			// If we have a malformed NDP NS option, drop the packet.
+		targetAddr := ns.TargetAddress()
+
+		// As per RFC 4861 section 4.3, the Target Address MUST NOT be a multicast
+		// address.
+		if header.IsV6MulticastAddress(targetAddr) {
 			received.Invalid.Increment()
 			return
 		}
 
-		targetAddr := ns.TargetAddress()
-		s := r.Stack()
-		if isTentative, err := s.IsAddrTentative(e.nicID, targetAddr); err != nil {
-			// We will only get an error if the NIC is unrecognized, which should not
-			// happen. For now, drop this packet.
-			//
-			// TODO(b/141002840): Handle this better?
-			return
-		} else if isTentative {
+		if e.hasTentativeAddr(targetAddr) {
 			// If the target address is tentative and the source of the packet is a
 			// unicast (specified) address, then the source of the packet is
 			// attempting to perform address resolution on the target. In this case,
@@ -175,7 +225,20 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 			// stack know so it can handle such a scenario and do nothing further with
 			// the NS.
 			if r.RemoteAddress == header.IPv6Any {
-				s.DupTentativeAddrDetected(e.nicID, targetAddr)
+				// We would get an error if the address no longer exists or the address
+				// is no longer tentative (DAD resolved between the call to
+				// hasTentativeAddr and this point). Both of these are valid scenarios:
+				//   1) An address may be removed at any time.
+				//   2) As per RFC 4862 section 5.4, DAD is not a perfect:
+				//       "Note that the method for detecting duplicates
+				//        is not completely reliable, and it is possible that duplicate
+				//        addresses will still exist"
+				//
+				// TODO(gvisor.dev/issue/4046): Handle the scenario when a duplicate
+				// address is detected for an assigned address.
+				if err := e.dupTentativeAddrDetected(targetAddr); err != nil && err != tcpip.ErrBadAddress && err != tcpip.ErrInvalidEndpointState {
+					panic(fmt.Sprintf("unexpected error handling duplicate tentative address: %s", err))
+				}
 			}
 
 			// Do not handle neighbor solicitations targeted to an address that is
@@ -187,48 +250,34 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 		// so the packet is processed as defined in RFC 4861, as per RFC 4862
 		// section 5.4.3.
 
-		// Is the NS targetting us?
-		if e.linkAddrCache.CheckLocalAddress(e.nicID, ProtocolNumber, targetAddr) == 0 {
+		// Is the NS targeting us?
+		if r.Stack().CheckLocalAddress(e.nic.ID(), ProtocolNumber, targetAddr) == 0 {
 			return
 		}
 
-		// If the NS message contains the Source Link-Layer Address option, update
-		// the link address cache with the value of the option.
-		//
-		// TODO(b/148429853): Properly process the NS message and do Neighbor
-		// Unreachability Detection.
 		var sourceLinkAddr tcpip.LinkAddress
-		for {
-			opt, done, err := it.Next()
+		{
+			it, err := ns.Options().Iter(false /* check */)
 			if err != nil {
-				// This should never happen as Iter(true) above did not return an error.
-				panic(fmt.Sprintf("unexpected error when iterating over NDP options: %s", err))
-			}
-			if done {
-				break
+				// Options are not valid as per the wire format, silently drop the
+				// packet.
+				received.Invalid.Increment()
+				return
 			}
 
-			switch opt := opt.(type) {
-			case header.NDPSourceLinkLayerAddressOption:
-				// No RFCs define what to do when an NS message has multiple Source
-				// Link-Layer Address options. Since no interface can have multiple
-				// link-layer addresses, we consider such messages invalid.
-				if len(sourceLinkAddr) != 0 {
-					received.Invalid.Increment()
-					return
-				}
-
-				sourceLinkAddr = opt.EthernetAddress()
+			sourceLinkAddr, ok = getSourceLinkAddr(it)
+			if !ok {
+				received.Invalid.Increment()
+				return
 			}
 		}
 
-		unspecifiedSource := r.RemoteAddress == header.IPv6Any
-
 		// As per RFC 4861 section 4.3, the Source Link-Layer Address Option MUST
 		// NOT be included when the source IP address is the unspecified address.
 		// Otherwise, on link layers that have addresses this option MUST be
 		// included in multicast solicitations and SHOULD be included in unicast
 		// solicitations.
+		unspecifiedSource := r.RemoteAddress == header.IPv6Any
 		if len(sourceLinkAddr) == 0 {
 			if header.IsV6MulticastAddress(r.LocalAddress) && !unspecifiedSource {
 				received.Invalid.Increment()
@@ -237,57 +286,88 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 		} else if unspecifiedSource {
 			received.Invalid.Increment()
 			return
+		} else if e.nud != nil {
+			e.nud.HandleProbe(r.RemoteAddress, header.IPv6ProtocolNumber, sourceLinkAddr, e.protocol)
 		} else {
-			e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, sourceLinkAddr)
-		}
-
-		// ICMPv6 Neighbor Solicit messages are always sent to
-		// specially crafted IPv6 multicast addresses. As a result, the
-		// route we end up with here has as its LocalAddress such a
-		// multicast address. It would be nonsense to claim that our
-		// source address is a multicast address, so we manually set
-		// the source address to the target address requested in the
-		// solicit message. Since that requires mutating the route, we
-		// must first clone it.
-		r := r.Clone()
-		defer r.Release()
-		r.LocalAddress = targetAddr
+			e.linkAddrCache.AddLinkAddress(e.nic.ID(), r.RemoteAddress, sourceLinkAddr)
+		}
+
+		// As per RFC 4861 section 7.1.1:
+		//   A node MUST silently discard any received Neighbor Solicitation
+		//   messages that do not satisfy all of the following validity checks:
+		//    ...
+		//    - If the IP source address is the unspecified address, the IP
+		//      destination address is a solicited-node multicast address.
+		if unspecifiedSource && !header.IsSolicitedNodeAddr(r.LocalAddress) {
+			received.Invalid.Increment()
+			return
+		}
 
-		// As per RFC 4861 section 7.2.4, if the the source of the solicitation is
-		// the unspecified address, the node MUST set the Solicited flag to zero and
-		// multicast the advertisement to the all-nodes address.
-		solicited := true
+		// As per RFC 4861 section 7.2.4:
+		//
+		//   If the source of the solicitation is the unspecified address, the node
+		//   MUST [...] and multicast the advertisement to the all-nodes address.
+		//
+		remoteAddr := r.RemoteAddress
 		if unspecifiedSource {
-			solicited = false
-			r.RemoteAddress = header.IPv6AllNodesMulticastAddress
+			remoteAddr = header.IPv6AllNodesMulticastAddress
+		}
+
+		// Even if we were able to receive a packet from some remote, we may not
+		// have a route to it - the remote may be blocked via routing rules. We must
+		// always consult our routing table and find a route to the remote before
+		// sending any packet.
+		r, err := e.protocol.stack.FindRoute(e.nic.ID(), targetAddr, remoteAddr, ProtocolNumber, false /* multicastLoop */)
+		if err != nil {
+			// If we cannot find a route to the destination, silently drop the packet.
+			return
 		}
+		defer r.Release()
 
-		// If the NS has a source link-layer option, use the link address it
-		// specifies as the remote link address for the response instead of the
-		// source link address of the packet.
+		// If the NS has a source link-layer option, resolve the route immediately
+		// to avoid querying the neighbor table when the neighbor entry was updated
+		// as probing the neighbor table for a link address will transition the
+		// entry's state from stale to delay.
+		//
+		// Note, if the source link address is unspecified and this is a unicast
+		// solicitation, we may need to perform neighbor discovery to send the
+		// neighbor advertisement response. This is expected as per RFC 4861 section
+		// 7.2.4:
+		//
+		//   Because unicast Neighbor Solicitations are not required to include a
+		//   Source Link-Layer Address, it is possible that a node sending a
+		//   solicited Neighbor Advertisement does not have a corresponding link-
+		//   layer address for its neighbor in its Neighbor Cache. In such
+		//   situations, a node will first have to use Neighbor Discovery to
+		//   determine the link-layer address of its neighbor (i.e., send out a
+		//   multicast Neighbor Solicitation).
 		//
-		// TODO(#2401): As per RFC 4861 section 7.2.4 we should consult our link
-		// address cache for the right destination link address instead of manually
-		// patching the route with the remote link address if one is specified in a
-		// Source Link-Layer Address option.
 		if len(sourceLinkAddr) != 0 {
-			r.RemoteLinkAddress = sourceLinkAddr
+			r.ResolveWith(sourceLinkAddr)
 		}
 
 		optsSerializer := header.NDPOptionsSerializer{
-			header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress),
+			header.NDPTargetLinkLayerAddressOption(e.nic.LinkAddress()),
 		}
+		neighborAdvertSize := header.ICMPv6NeighborAdvertMinimumSize + optsSerializer.Length()
 		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-			ReserveHeaderBytes: int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length()),
+			ReserveHeaderBytes: int(r.MaxHeaderLength()) + neighborAdvertSize,
 		})
-		packet := header.ICMPv6(pkt.TransportHeader().Push(header.ICMPv6NeighborAdvertSize))
+		pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber
+		packet := header.ICMPv6(pkt.TransportHeader().Push(neighborAdvertSize))
 		packet.SetType(header.ICMPv6NeighborAdvert)
 		na := header.NDPNeighborAdvert(packet.NDPPayload())
-		na.SetSolicitedFlag(solicited)
+
+		// As per RFC 4861 section 7.2.4:
+		//
+		//   If the source of the solicitation is the unspecified address, the node
+		//   MUST set the Solicited flag to zero and [..]. Otherwise, the node MUST
+		//   set the Solicited flag to one and [..].
+		//
+		na.SetSolicitedFlag(!unspecifiedSource)
 		na.SetOverrideFlag(true)
 		na.SetTargetAddress(targetAddr)
-		opts := na.Options()
-		opts.Serialize(optsSerializer)
+		na.Options().Serialize(optsSerializer)
 		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
 		// RFC 4861 Neighbor Discovery for IP version 6 (IPv6)
@@ -304,7 +384,7 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 
 	case header.ICMPv6NeighborAdvert:
 		received.NeighborAdvert.Increment()
-		if pkt.Data.Size() < header.ICMPv6NeighborAdvertSize || !isNDPValid() {
+		if !isNDPValid() || pkt.Data.Size() < header.ICMPv6NeighborAdvertMinimumSize {
 			received.Invalid.Increment()
 			return
 		}
@@ -314,28 +394,34 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 		// 5, NDP messages cannot be fragmented. Also note that in the common case
 		// NDP datagrams are very small and ToView() will not incur allocations.
 		na := header.NDPNeighborAdvert(payload.ToView())
-		it, err := na.Options().Iter(true)
-		if err != nil {
-			// If we have a malformed NDP NA option, drop the packet.
-			received.Invalid.Increment()
-			return
-		}
-
 		targetAddr := na.TargetAddress()
-		stack := r.Stack()
-
-		if isTentative, err := stack.IsAddrTentative(e.nicID, targetAddr); err != nil {
-			// We will only get an error if the NIC is unrecognized, which should not
-			// happen. For now short-circuit this packet.
-			//
-			// TODO(b/141002840): Handle this better?
-			return
-		} else if isTentative {
+		if e.hasTentativeAddr(targetAddr) {
 			// We just got an NA from a node that owns an address we are performing
 			// DAD on, implying the address is not unique. In this case we let the
 			// stack know so it can handle such a scenario and do nothing furthur with
 			// the NDP NA.
-			stack.DupTentativeAddrDetected(e.nicID, targetAddr)
+			//
+			// We would get an error if the address no longer exists or the address
+			// is no longer tentative (DAD resolved between the call to
+			// hasTentativeAddr and this point). Both of these are valid scenarios:
+			//   1) An address may be removed at any time.
+			//   2) As per RFC 4862 section 5.4, DAD is not a perfect:
+			//       "Note that the method for detecting duplicates
+			//        is not completely reliable, and it is possible that duplicate
+			//        addresses will still exist"
+			//
+			// TODO(gvisor.dev/issue/4046): Handle the scenario when a duplicate
+			// address is detected for an assigned address.
+			if err := e.dupTentativeAddrDetected(targetAddr); err != nil && err != tcpip.ErrBadAddress && err != tcpip.ErrInvalidEndpointState {
+				panic(fmt.Sprintf("unexpected error handling duplicate tentative address: %s", err))
+			}
+			return
+		}
+
+		it, err := na.Options().Iter(false /* check */)
+		if err != nil {
+			// If we have a malformed NDP NA option, drop the packet.
+			received.Invalid.Increment()
 			return
 		}
 
@@ -348,40 +434,26 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 		// TODO(b/143147598): Handle the scenario described above. Also inform the
 		// netstack integration that a duplicate address was detected outside of
 		// DAD.
+		targetLinkAddr, ok := getTargetLinkAddr(it)
+		if !ok {
+			received.Invalid.Increment()
+			return
+		}
 
 		// If the NA message has the target link layer option, update the link
 		// address cache with the link address for the target of the message.
-		//
-		// TODO(b/148429853): Properly process the NA message and do Neighbor
-		// Unreachability Detection.
-		var targetLinkAddr tcpip.LinkAddress
-		for {
-			opt, done, err := it.Next()
-			if err != nil {
-				// This should never happen as Iter(true) above did not return an error.
-				panic(fmt.Sprintf("unexpected error when iterating over NDP options: %s", err))
-			}
-			if done {
-				break
-			}
-
-			switch opt := opt.(type) {
-			case header.NDPTargetLinkLayerAddressOption:
-				// No RFCs define what to do when an NA message has multiple Target
-				// Link-Layer Address options. Since no interface can have multiple
-				// link-layer addresses, we consider such messages invalid.
-				if len(targetLinkAddr) != 0 {
-					received.Invalid.Increment()
-					return
-				}
-
-				targetLinkAddr = opt.EthernetAddress()
+		if e.nud == nil {
+			if len(targetLinkAddr) != 0 {
+				e.linkAddrCache.AddLinkAddress(e.nic.ID(), targetAddr, targetLinkAddr)
 			}
+			return
 		}
 
-		if len(targetLinkAddr) != 0 {
-			e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, targetLinkAddr)
-		}
+		e.nud.HandleConfirmation(targetAddr, targetLinkAddr, stack.ReachabilityConfirmationFlags{
+			Solicited: na.SolicitedFlag(),
+			Override:  na.OverrideFlag(),
+			IsRouter:  na.RouterFlag(),
+		})
 
 	case header.ICMPv6EchoRequest:
 		received.EchoRequest.Increment()
@@ -391,8 +463,6 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 			return
 		}
 
-		remoteLinkAddr := r.RemoteLinkAddress
-
 		// As per RFC 4291 section 2.7, multicast addresses must not be used as
 		// source addresses in IPv6 packets.
 		localAddr := r.LocalAddress
@@ -400,21 +470,19 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 			localAddr = ""
 		}
 
-		r, err := r.Stack().FindRoute(e.NICID(), localAddr, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */)
+		r, err := r.Stack().FindRoute(e.nic.ID(), localAddr, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */)
 		if err != nil {
 			// If we cannot find a route to the destination, silently drop the packet.
 			return
 		}
 		defer r.Release()
 
-		// Use the link address from the source of the original packet.
-		r.ResolveWith(remoteLinkAddr)
-
 		replyPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
 			ReserveHeaderBytes: int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize,
 			Data:               pkt.Data,
 		})
 		packet := header.ICMPv6(replyPkt.TransportHeader().Push(header.ICMPv6EchoMinimumSize))
+		pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber
 		copy(packet, icmpHdr)
 		packet.SetType(header.ICMPv6EchoReply)
 		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data))
@@ -440,27 +508,75 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 
 	case header.ICMPv6RouterSolicit:
 		received.RouterSolicit.Increment()
-		if !isNDPValid() {
+
+		//
+		// Validate the RS as per RFC 4861 section 6.1.1.
+		//
+
+		// Is the NDP payload of sufficient size to hold a Router Solictation?
+		if !isNDPValid() || pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRSMinimumSize {
 			received.Invalid.Increment()
 			return
 		}
 
-	case header.ICMPv6RouterAdvert:
-		received.RouterAdvert.Increment()
+		stack := r.Stack()
+
+		// Is the networking stack operating as a router?
+		if !stack.Forwarding(ProtocolNumber) {
+			// ... No, silently drop the packet.
+			received.RouterOnlyPacketsDroppedByHost.Increment()
+			return
+		}
+
+		// Note that in the common case NDP datagrams are very small and ToView()
+		// will not incur allocations.
+		rs := header.NDPRouterSolicit(payload.ToView())
+		it, err := rs.Options().Iter(false /* check */)
+		if err != nil {
+			// Options are not valid as per the wire format, silently drop the packet.
+			received.Invalid.Increment()
+			return
+		}
 
-		// Is the NDP payload of sufficient size to hold a Router
-		// Advertisement?
-		if pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRAMinimumSize || !isNDPValid() {
+		sourceLinkAddr, ok := getSourceLinkAddr(it)
+		if !ok {
 			received.Invalid.Increment()
 			return
 		}
 
-		routerAddr := iph.SourceAddress()
+		// If the RS message has the source link layer option, update the link
+		// address cache with the link address for the source of the message.
+		if len(sourceLinkAddr) != 0 {
+			// As per RFC 4861 section 4.1, the Source Link-Layer Address Option MUST
+			// NOT be included when the source IP address is the unspecified address.
+			// Otherwise, it SHOULD be included on link layers that have addresses.
+			if r.RemoteAddress == header.IPv6Any {
+				received.Invalid.Increment()
+				return
+			}
+
+			if e.nud != nil {
+				// A RS with a specified source IP address modifies the NUD state
+				// machine in the same way a reachability probe would.
+				e.nud.HandleProbe(r.RemoteAddress, header.IPv6ProtocolNumber, sourceLinkAddr, e.protocol)
+			}
+		}
+
+	case header.ICMPv6RouterAdvert:
+		received.RouterAdvert.Increment()
 
 		//
 		// Validate the RA as per RFC 4861 section 6.1.2.
 		//
 
+		// Is the NDP payload of sufficient size to hold a Router Advertisement?
+		if !isNDPValid() || pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRAMinimumSize {
+			received.Invalid.Increment()
+			return
+		}
+
+		routerAddr := iph.SourceAddress()
+
 		// Is the IP Source Address a link-local address?
 		if !header.IsV6LinkLocalAddress(routerAddr) {
 			// ...No, silently drop the packet.
@@ -468,16 +584,18 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 			return
 		}
 
-		// The remainder of payload must be only the router advertisement, so
-		// payload.ToView() always returns the advertisement. Per RFC 6980 section
-		// 5, NDP messages cannot be fragmented. Also note that in the common case
-		// NDP datagrams are very small and ToView() will not incur allocations.
+		// Note that in the common case NDP datagrams are very small and ToView()
+		// will not incur allocations.
 		ra := header.NDPRouterAdvert(payload.ToView())
-		opts := ra.Options()
+		it, err := ra.Options().Iter(false /* check */)
+		if err != nil {
+			// Options are not valid as per the wire format, silently drop the packet.
+			received.Invalid.Increment()
+			return
+		}
 
-		// Are options valid as per the wire format?
-		if _, err := opts.Iter(true); err != nil {
-			// ...No, silently drop the packet.
+		sourceLinkAddr, ok := getSourceLinkAddr(it)
+		if !ok {
 			received.Invalid.Increment()
 			return
 		}
@@ -487,12 +605,33 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 		// as RFC 4861 section 6.1.2 is concerned.
 		//
 
-		// Tell the NIC to handle the RA.
-		stack := r.Stack()
-		rxNICID := r.NICID()
-		stack.HandleNDPRA(rxNICID, routerAddr, ra)
+		// If the RA has the source link layer option, update the link address
+		// cache with the link address for the advertised router.
+		if len(sourceLinkAddr) != 0 && e.nud != nil {
+			e.nud.HandleProbe(routerAddr, header.IPv6ProtocolNumber, sourceLinkAddr, e.protocol)
+		}
+
+		e.mu.Lock()
+		e.mu.ndp.handleRA(routerAddr, ra)
+		e.mu.Unlock()
 
 	case header.ICMPv6RedirectMsg:
+		// TODO(gvisor.dev/issue/2285): Call `e.nud.HandleProbe` after validating
+		// this redirect message, as per RFC 4871 section 7.3.3:
+		//
+		//    "A Neighbor Cache entry enters the STALE state when created as a
+		//    result of receiving packets other than solicited Neighbor
+		//    Advertisements (i.e., Router Solicitations, Router Advertisements,
+		//    Redirects, and Neighbor Solicitations).  These packets contain the
+		//    link-layer address of either the sender or, in the case of Redirect,
+		//    the redirection target.  However, receipt of these link-layer
+		//    addresses does not confirm reachability of the forward-direction path
+		//    to that node.  Placing a newly created Neighbor Cache entry for which
+		//    the link-layer address is known in the STALE state provides assurance
+		//    that path failures are detected quickly. In addition, should a cached
+		//    link-layer address be modified due to receiving one of the above
+		//    messages, the state SHOULD also be set to STALE to provide prompt
+		//    verification that the path to the new link-layer address is working."
 		received.RedirectMsg.Increment()
 		if !isNDPValid() {
 			received.Invalid.Increment()
@@ -504,18 +643,6 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragme
 	}
 }
 
-const (
-	ndpSolicitedFlag = 1 << 6
-	ndpOverrideFlag  = 1 << 5
-
-	ndpOptSrcLinkAddr = 1
-	ndpOptDstLinkAddr = 2
-
-	icmpV6FlagOffset   = 4
-	icmpV6OptOffset    = 24
-	icmpV6LengthOffset = 25
-)
-
 var _ stack.LinkAddressResolver = (*protocol)(nil)
 
 // LinkAddressProtocol implements stack.LinkAddressResolver.
@@ -524,44 +651,46 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
 }
 
 // LinkAddressRequest implements stack.LinkAddressResolver.
-func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP stack.LinkEndpoint) *tcpip.Error {
-	snaddr := header.SolicitedNodeAddr(addr)
-
-	// TODO(b/148672031): Use stack.FindRoute instead of manually creating the
-	// route here. Note, we would need the nicID to do this properly so the right
-	// NIC (associated to linkEP) is used to send the NDP NS message.
-	r := &stack.Route{
-		LocalAddress:      localAddr,
-		RemoteAddress:     snaddr,
-		RemoteLinkAddress: remoteLinkAddr,
+func (p *protocol) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, nic stack.NetworkInterface) *tcpip.Error {
+	remoteAddr := targetAddr
+	if len(remoteLinkAddr) == 0 {
+		remoteAddr = header.SolicitedNodeAddr(targetAddr)
+		remoteLinkAddr = header.EthernetAddressFromMulticastIPv6Address(remoteAddr)
 	}
-	if len(r.RemoteLinkAddress) == 0 {
-		r.RemoteLinkAddress = header.EthernetAddressFromMulticastIPv6Address(snaddr)
+
+	r, err := p.stack.FindRoute(nic.ID(), localAddr, remoteAddr, ProtocolNumber, false /* multicastLoop */)
+	if err != nil {
+		return err
 	}
+	defer r.Release()
+	r.ResolveWith(remoteLinkAddr)
 
+	optsSerializer := header.NDPOptionsSerializer{
+		header.NDPSourceLinkLayerAddressOption(nic.LinkAddress()),
+	}
+	neighborSolicitSize := header.ICMPv6NeighborSolicitMinimumSize + optsSerializer.Length()
 	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-		ReserveHeaderBytes: int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize,
-	})
-	icmpHdr := header.ICMPv6(pkt.TransportHeader().Push(header.ICMPv6NeighborAdvertSize))
-	icmpHdr.SetType(header.ICMPv6NeighborSolicit)
-	copy(icmpHdr[icmpV6OptOffset-len(addr):], addr)
-	icmpHdr[icmpV6OptOffset] = ndpOptSrcLinkAddr
-	icmpHdr[icmpV6LengthOffset] = 1
-	copy(icmpHdr[icmpV6LengthOffset+1:], linkEP.LinkAddress())
-	icmpHdr.SetChecksum(header.ICMPv6Checksum(icmpHdr, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
-
-	length := uint16(pkt.Size())
-	ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize))
-	ip.Encode(&header.IPv6Fields{
-		PayloadLength: length,
-		NextHeader:    uint8(header.ICMPv6ProtocolNumber),
-		HopLimit:      header.NDPHopLimit,
-		SrcAddr:       r.LocalAddress,
-		DstAddr:       r.RemoteAddress,
+		ReserveHeaderBytes: int(r.MaxHeaderLength()) + neighborSolicitSize,
 	})
+	pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber
+	packet := header.ICMPv6(pkt.TransportHeader().Push(neighborSolicitSize))
+	packet.SetType(header.ICMPv6NeighborSolicit)
+	ns := header.NDPNeighborSolicit(packet.NDPPayload())
+	ns.SetTargetAddress(targetAddr)
+	ns.Options().Serialize(optsSerializer)
+	packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+
+	stat := p.stack.Stats().ICMP.V6PacketsSent
+	if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{
+		Protocol: header.ICMPv6ProtocolNumber,
+		TTL:      header.NDPHopLimit,
+	}, pkt); err != nil {
+		stat.Dropped.Increment()
+		return err
+	}
 
-	// TODO(stijlist): count this in ICMP stats.
-	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt)
+	stat.NeighborSolicit.Increment()
+	return nil
 }
 
 // ResolveStaticAddress implements stack.LinkAddressResolver.
@@ -571,3 +700,192 @@ func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bo
 	}
 	return tcpip.LinkAddress([]byte(nil)), false
 }
+
+// ======= ICMP Error packet generation =========
+
+// icmpReason is a marker interface for IPv6 specific ICMP errors.
+type icmpReason interface {
+	isICMPReason()
+}
+
+// icmpReasonParameterProblem is an error during processing of extension headers
+// or the fixed header defined in RFC 4443 section 3.4.
+type icmpReasonParameterProblem struct {
+	code header.ICMPv6Code
+
+	// respondToMulticast indicates that we are sending a packet that falls under
+	// the exception outlined by RFC 4443 section 2.4 point e.3 exception 2:
+	//
+	//       (e.3) A packet destined to an IPv6 multicast address.  (There are
+	//             two exceptions to this rule: (1) the Packet Too Big Message
+	//             (Section 3.2) to allow Path MTU discovery to work for IPv6
+	//             multicast, and (2) the Parameter Problem Message, Code 2
+	//             (Section 3.4) reporting an unrecognized IPv6 option (see
+	//             Section 4.2 of [IPv6]) that has the Option Type highest-
+	//             order two bits set to 10).
+	respondToMulticast bool
+
+	// pointer is defined in the RFC 4443 setion 3.4 which reads:
+	//
+	//  Pointer         Identifies the octet offset within the invoking packet
+	//                  where the error was detected.
+	//
+	//                  The pointer will point beyond the end of the ICMPv6
+	//                  packet if the field in error is beyond what can fit
+	//                  in the maximum size of an ICMPv6 error message.
+	pointer uint32
+}
+
+func (*icmpReasonParameterProblem) isICMPReason() {}
+
+// icmpReasonPortUnreachable is an error where the transport protocol has no
+// listener and no alternative means to inform the sender.
+type icmpReasonPortUnreachable struct{}
+
+func (*icmpReasonPortUnreachable) isICMPReason() {}
+
+// icmpReasonReassemblyTimeout is an error where insufficient fragments are
+// received to complete reassembly of a packet within a configured time after
+// the reception of the first-arriving fragment of that packet.
+type icmpReasonReassemblyTimeout struct{}
+
+func (*icmpReasonReassemblyTimeout) isICMPReason() {}
+
+// returnError takes an error descriptor and generates the appropriate ICMP
+// error packet for IPv6 and sends it.
+func (p *protocol) returnError(r *stack.Route, reason icmpReason, pkt *stack.PacketBuffer) *tcpip.Error {
+	// Only send ICMP error if the address is not a multicast v6
+	// address and the source is not the unspecified address.
+	//
+	// There are exceptions to this rule.
+	// See: point e.3) RFC 4443 section-2.4
+	//
+	//	 (e) An ICMPv6 error message MUST NOT be originated as a result of
+	//       receiving the following:
+	//
+	//       (e.1) An ICMPv6 error message.
+	//
+	//       (e.2) An ICMPv6 redirect message [IPv6-DISC].
+	//
+	//       (e.3) A packet destined to an IPv6 multicast address.  (There are
+	//             two exceptions to this rule: (1) the Packet Too Big Message
+	//             (Section 3.2) to allow Path MTU discovery to work for IPv6
+	//             multicast, and (2) the Parameter Problem Message, Code 2
+	//             (Section 3.4) reporting an unrecognized IPv6 option (see
+	//             Section 4.2 of [IPv6]) that has the Option Type highest-
+	//             order two bits set to 10).
+	//
+	var allowResponseToMulticast bool
+	if reason, ok := reason.(*icmpReasonParameterProblem); ok {
+		allowResponseToMulticast = reason.respondToMulticast
+	}
+
+	if (!allowResponseToMulticast && header.IsV6MulticastAddress(r.LocalAddress)) || r.RemoteAddress == header.IPv6Any {
+		return nil
+	}
+
+	// Even if we were able to receive a packet from some remote, we may not have
+	// a route to it - the remote may be blocked via routing rules. We must always
+	// consult our routing table and find a route to the remote before sending any
+	// packet.
+	route, err := p.stack.FindRoute(r.NICID(), r.LocalAddress, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */)
+	if err != nil {
+		return err
+	}
+	defer route.Release()
+	// From this point on, the incoming route should no longer be used; route
+	// must be used to send the ICMP error.
+	r = nil
+
+	stats := p.stack.Stats().ICMP
+	sent := stats.V6PacketsSent
+	if !p.stack.AllowICMPMessage() {
+		sent.RateLimited.Increment()
+		return nil
+	}
+
+	network, transport := pkt.NetworkHeader().View(), pkt.TransportHeader().View()
+
+	if pkt.TransportProtocolNumber == header.ICMPv6ProtocolNumber {
+		// TODO(gvisor.dev/issues/3810): Sort this out when ICMP headers are stored.
+		// Unfortunately at this time ICMP Packets do not have a transport
+		// header separated out. It is in the Data part so we need to
+		// separate it out now. We will just pretend it is a minimal length
+		// ICMP packet as we don't really care if any later bits of a
+		// larger ICMP packet are in the header view or in the Data view.
+		transport, ok := pkt.TransportHeader().Consume(header.ICMPv6MinimumSize)
+		if !ok {
+			return nil
+		}
+		typ := header.ICMPv6(transport).Type()
+		if typ.IsErrorType() || typ == header.ICMPv6RedirectMsg {
+			return nil
+		}
+	}
+
+	// As per RFC 4443 section 2.4
+	//
+	//    (c) Every ICMPv6 error message (type < 128) MUST include
+	//    as much of the IPv6 offending (invoking) packet (the
+	//    packet that caused the error) as possible without making
+	//    the error message packet exceed the minimum IPv6 MTU
+	//    [IPv6].
+	mtu := int(route.MTU())
+	if mtu > header.IPv6MinimumMTU {
+		mtu = header.IPv6MinimumMTU
+	}
+	headerLen := int(route.MaxHeaderLength()) + header.ICMPv6ErrorHeaderSize
+	available := int(mtu) - headerLen
+	if available < header.IPv6MinimumSize {
+		return nil
+	}
+	payloadLen := network.Size() + transport.Size() + pkt.Data.Size()
+	if payloadLen > available {
+		payloadLen = available
+	}
+	payload := network.ToVectorisedView()
+	payload.AppendView(transport)
+	payload.Append(pkt.Data)
+	payload.CapLength(payloadLen)
+
+	newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+		ReserveHeaderBytes: headerLen,
+		Data:               payload,
+	})
+	newPkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber
+
+	icmpHdr := header.ICMPv6(newPkt.TransportHeader().Push(header.ICMPv6DstUnreachableMinimumSize))
+	var counter *tcpip.StatCounter
+	switch reason := reason.(type) {
+	case *icmpReasonParameterProblem:
+		icmpHdr.SetType(header.ICMPv6ParamProblem)
+		icmpHdr.SetCode(reason.code)
+		icmpHdr.SetTypeSpecific(reason.pointer)
+		counter = sent.ParamProblem
+	case *icmpReasonPortUnreachable:
+		icmpHdr.SetType(header.ICMPv6DstUnreachable)
+		icmpHdr.SetCode(header.ICMPv6PortUnreachable)
+		counter = sent.DstUnreachable
+	case *icmpReasonReassemblyTimeout:
+		icmpHdr.SetType(header.ICMPv6TimeExceeded)
+		icmpHdr.SetCode(header.ICMPv6ReassemblyTimeout)
+		counter = sent.TimeExceeded
+	default:
+		panic(fmt.Sprintf("unsupported ICMP type %T", reason))
+	}
+	icmpHdr.SetChecksum(header.ICMPv6Checksum(icmpHdr, route.LocalAddress, route.RemoteAddress, newPkt.Data))
+	if err := route.WritePacket(
+		nil, /* gso */
+		stack.NetworkHeaderParams{
+			Protocol: header.ICMPv6ProtocolNumber,
+			TTL:      route.DefaultTTL(),
+			TOS:      stack.DefaultTOS,
+		},
+		newPkt,
+	); err != nil {
+		sent.Dropped.Increment()
+		return err
+	}
+	counter.Increment()
+	return nil
+}
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 9e4eeea77..aa8b5f2e5 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -16,40 +16,57 @@ package ipv6
 
 import (
 	"context"
+	"net"
 	"reflect"
 	"strings"
 	"testing"
+	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
 const (
+	nicID = 1
+
 	linkAddr0 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
 	linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e")
 	linkAddr2 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0f")
 
 	defaultChannelSize = 1
 	defaultMTU         = 65536
+
+	// Extra time to use when waiting for an async event to occur.
+	defaultAsyncPositiveEventTimeout = 30 * time.Second
 )
 
 var (
 	lladdr0 = header.LinkLocalAddr(linkAddr0)
 	lladdr1 = header.LinkLocalAddr(linkAddr1)
+	lladdr2 = header.LinkLocalAddr(linkAddr2)
 )
 
 type stubLinkEndpoint struct {
 	stack.LinkEndpoint
 }
 
+func (*stubLinkEndpoint) MTU() uint32 {
+	return defaultMTU
+}
+
 func (*stubLinkEndpoint) Capabilities() stack.LinkEndpointCapabilities {
-	return 0
+	// Indicate that resolution for link layer addresses is required to send
+	// packets over this link. This is needed so the NIC knows to allocate a
+	// neighbor table.
+	return stack.CapabilityResolutionRequired
 }
 
 func (*stubLinkEndpoint) MaxHeaderLength() uint16 {
@@ -70,7 +87,8 @@ type stubDispatcher struct {
 	stack.TransportDispatcher
 }
 
-func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, *stack.PacketBuffer) {
+func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, *stack.PacketBuffer) stack.TransportPacketDisposition {
+	return stack.TransportPacketHandled
 }
 
 type stubLinkAddressCache struct {
@@ -84,16 +102,225 @@ func (*stubLinkAddressCache) CheckLocalAddress(tcpip.NICID, tcpip.NetworkProtoco
 func (*stubLinkAddressCache) AddLinkAddress(tcpip.NICID, tcpip.Address, tcpip.LinkAddress) {
 }
 
+type stubNUDHandler struct {
+	probeCount        int
+	confirmationCount int
+}
+
+var _ stack.NUDHandler = (*stubNUDHandler)(nil)
+
+func (s *stubNUDHandler) HandleProbe(tcpip.Address, tcpip.NetworkProtocolNumber, tcpip.LinkAddress, stack.LinkAddressResolver) {
+	s.probeCount++
+}
+
+func (s *stubNUDHandler) HandleConfirmation(tcpip.Address, tcpip.LinkAddress, stack.ReachabilityConfirmationFlags) {
+	s.confirmationCount++
+}
+
+func (*stubNUDHandler) HandleUpperLevelConfirmation(tcpip.Address) {
+}
+
+var _ stack.NetworkInterface = (*testInterface)(nil)
+
+type testInterface struct {
+	stack.LinkEndpoint
+
+	nicID tcpip.NICID
+}
+
+func (*testInterface) ID() tcpip.NICID {
+	return nicID
+}
+
+func (*testInterface) IsLoopback() bool {
+	return false
+}
+
+func (*testInterface) Name() string {
+	return ""
+}
+
+func (*testInterface) Enabled() bool {
+	return true
+}
+
+func (t *testInterface) WritePacketToRemote(remoteLinkAddr tcpip.LinkAddress, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	r := stack.Route{
+		NetProto:          protocol,
+		RemoteLinkAddress: remoteLinkAddr,
+	}
+	return t.LinkEndpoint.WritePacket(&r, gso, protocol, pkt)
+}
+
 func TestICMPCounts(t *testing.T) {
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocolFactory{NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6},
+				UseNeighborCache:   test.useNeighborCache,
+			})
+			{
+				if err := s.CreateNIC(nicID, &stubLinkEndpoint{}); err != nil {
+					t.Fatalf("CreateNIC(_, _) = %s", err)
+				}
+				if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+					t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+				}
+			}
+			{
+				subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+				if err != nil {
+					t.Fatal(err)
+				}
+				s.SetRouteTable(
+					[]tcpip.Route{{
+						Destination: subnet,
+						NIC:         nicID,
+					}},
+				)
+			}
+
+			netProto := s.NetworkProtocolInstance(ProtocolNumber)
+			if netProto == nil {
+				t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
+			}
+			ep := netProto.NewEndpoint(&testInterface{}, &stubLinkAddressCache{}, &stubNUDHandler{}, &stubDispatcher{})
+			defer ep.Close()
+
+			if err := ep.Enable(); err != nil {
+				t.Fatalf("ep.Enable(): %s", err)
+			}
+
+			r, err := s.FindRoute(nicID, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
+			if err != nil {
+				t.Fatalf("FindRoute(%d, %s, %s, _, false) = (_, %s), want = (_, nil)", nicID, lladdr0, lladdr1, err)
+			}
+			defer r.Release()
+
+			var tllData [header.NDPLinkLayerAddressSize]byte
+			header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
+				header.NDPTargetLinkLayerAddressOption(linkAddr1),
+			})
+
+			types := []struct {
+				typ       header.ICMPv6Type
+				size      int
+				extraData []byte
+			}{
+				{
+					typ:  header.ICMPv6DstUnreachable,
+					size: header.ICMPv6DstUnreachableMinimumSize,
+				},
+				{
+					typ:  header.ICMPv6PacketTooBig,
+					size: header.ICMPv6PacketTooBigMinimumSize,
+				},
+				{
+					typ:  header.ICMPv6TimeExceeded,
+					size: header.ICMPv6MinimumSize,
+				},
+				{
+					typ:  header.ICMPv6ParamProblem,
+					size: header.ICMPv6MinimumSize,
+				},
+				{
+					typ:  header.ICMPv6EchoRequest,
+					size: header.ICMPv6EchoMinimumSize,
+				},
+				{
+					typ:  header.ICMPv6EchoReply,
+					size: header.ICMPv6EchoMinimumSize,
+				},
+				{
+					typ:  header.ICMPv6RouterSolicit,
+					size: header.ICMPv6MinimumSize,
+				},
+				{
+					typ:  header.ICMPv6RouterAdvert,
+					size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
+				},
+				{
+					typ:  header.ICMPv6NeighborSolicit,
+					size: header.ICMPv6NeighborSolicitMinimumSize,
+				},
+				{
+					typ:       header.ICMPv6NeighborAdvert,
+					size:      header.ICMPv6NeighborAdvertMinimumSize,
+					extraData: tllData[:],
+				},
+				{
+					typ:  header.ICMPv6RedirectMsg,
+					size: header.ICMPv6MinimumSize,
+				},
+			}
+
+			handleIPv6Payload := func(icmp header.ICMPv6) {
+				pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+					ReserveHeaderBytes: header.IPv6MinimumSize,
+					Data:               buffer.View(icmp).ToVectorisedView(),
+				})
+				ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(len(icmp)),
+					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+					HopLimit:      header.NDPHopLimit,
+					SrcAddr:       r.LocalAddress,
+					DstAddr:       r.RemoteAddress,
+				})
+				ep.HandlePacket(&r, pkt)
+			}
+
+			for _, typ := range types {
+				icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+				copy(icmp[typ.size:], typ.extraData)
+				icmp.SetType(typ.typ)
+				icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView()))
+				handleIPv6Payload(icmp)
+			}
+
+			// Construct an empty ICMP packet so that
+			// Stats().ICMP.ICMPv6ReceivedPacketStats.Invalid is incremented.
+			handleIPv6Payload(header.ICMPv6(buffer.NewView(header.IPv6MinimumSize)))
+
+			icmpv6Stats := s.Stats().ICMP.V6PacketsReceived
+			visitStats(reflect.ValueOf(&icmpv6Stats).Elem(), func(name string, s *tcpip.StatCounter) {
+				if got, want := s.Value(), uint64(1); got != want {
+					t.Errorf("got %s = %d, want = %d", name, got, want)
+				}
+			})
+			if t.Failed() {
+				t.Logf("stats:\n%+v", s.Stats())
+			}
+		})
+	}
+}
+
+func TestICMPCountsWithNeighborCache(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6},
+		UseNeighborCache:   true,
 	})
 	{
-		if err := s.CreateNIC(1, &stubLinkEndpoint{}); err != nil {
-			t.Fatalf("CreateNIC(_) = %s", err)
+		if err := s.CreateNIC(nicID, &stubLinkEndpoint{}); err != nil {
+			t.Fatalf("CreateNIC(_, _) = %s", err)
 		}
-		if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+		if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
 			t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
 		}
 	}
@@ -105,7 +332,7 @@ func TestICMPCounts(t *testing.T) {
 		s.SetRouteTable(
 			[]tcpip.Route{{
 				Destination: subnet,
-				NIC:         1,
+				NIC:         nicID,
 			}},
 		)
 	}
@@ -114,12 +341,16 @@ func TestICMPCounts(t *testing.T) {
 	if netProto == nil {
 		t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
 	}
-	ep := netProto.NewEndpoint(0, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s)
+	ep := netProto.NewEndpoint(&testInterface{}, nil, &stubNUDHandler{}, &stubDispatcher{})
 	defer ep.Close()
 
-	r, err := s.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
+	if err := ep.Enable(); err != nil {
+		t.Fatalf("ep.Enable(): %s", err)
+	}
+
+	r, err := s.FindRoute(nicID, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
 	if err != nil {
-		t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err)
+		t.Fatalf("FindRoute(%d, %s, %s, _, false) = (_, %s), want = (_, nil)", nicID, lladdr0, lladdr1, err)
 	}
 	defer r.Release()
 
@@ -250,12 +481,12 @@ func (e endpointWithResolutionCapability) Capabilities() stack.LinkEndpointCapab
 func newTestContext(t *testing.T) *testContext {
 	c := &testContext{
 		s0: stack.New(stack.Options{
-			NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
-			TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()},
+			NetworkProtocols:   []stack.NetworkProtocolFactory{NewProtocol},
+			TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6},
 		}),
 		s1: stack.New(stack.Options{
-			NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
-			TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()},
+			NetworkProtocols:   []stack.NetworkProtocolFactory{NewProtocol},
+			TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6},
 		}),
 	}
 
@@ -265,19 +496,19 @@ func newTestContext(t *testing.T) *testContext {
 	if testing.Verbose() {
 		wrappedEP0 = sniffer.New(wrappedEP0)
 	}
-	if err := c.s0.CreateNIC(1, wrappedEP0); err != nil {
+	if err := c.s0.CreateNIC(nicID, wrappedEP0); err != nil {
 		t.Fatalf("CreateNIC s0: %v", err)
 	}
-	if err := c.s0.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+	if err := c.s0.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
 		t.Fatalf("AddAddress lladdr0: %v", err)
 	}
 
 	c.linkEP1 = channel.New(defaultChannelSize, defaultMTU, linkAddr1)
 	wrappedEP1 := stack.LinkEndpoint(endpointWithResolutionCapability{LinkEndpoint: c.linkEP1})
-	if err := c.s1.CreateNIC(1, wrappedEP1); err != nil {
+	if err := c.s1.CreateNIC(nicID, wrappedEP1); err != nil {
 		t.Fatalf("CreateNIC failed: %v", err)
 	}
-	if err := c.s1.AddAddress(1, ProtocolNumber, lladdr1); err != nil {
+	if err := c.s1.AddAddress(nicID, ProtocolNumber, lladdr1); err != nil {
 		t.Fatalf("AddAddress lladdr1: %v", err)
 	}
 
@@ -288,7 +519,7 @@ func newTestContext(t *testing.T) *testContext {
 	c.s0.SetRouteTable(
 		[]tcpip.Route{{
 			Destination: subnet0,
-			NIC:         1,
+			NIC:         nicID,
 		}},
 	)
 	subnet1, err := tcpip.NewSubnet(lladdr0, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr0))))
@@ -298,7 +529,7 @@ func newTestContext(t *testing.T) *testContext {
 	c.s1.SetRouteTable(
 		[]tcpip.Route{{
 			Destination: subnet1,
-			NIC:         1,
+			NIC:         nicID,
 		}},
 	)
 
@@ -359,9 +590,9 @@ func TestLinkResolution(t *testing.T) {
 	c := newTestContext(t)
 	defer c.cleanup()
 
-	r, err := c.s0.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
+	r, err := c.s0.FindRoute(nicID, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
 	if err != nil {
-		t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err)
+		t.Fatalf("FindRoute(%d, %s, %s, _, false) = (_, %s), want = (_, nil)", nicID, lladdr0, lladdr1, err)
 	}
 	defer r.Release()
 
@@ -376,14 +607,14 @@ func TestLinkResolution(t *testing.T) {
 	var wq waiter.Queue
 	ep, err := c.s0.NewEndpoint(header.ICMPv6ProtocolNumber, ProtocolNumber, &wq)
 	if err != nil {
-		t.Fatalf("NewEndpoint(_) = _, %s, want = _, nil", err)
+		t.Fatalf("NewEndpoint(_) = (_, %s), want = (_, nil)", err)
 	}
 
 	for {
-		_, resCh, err := ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{NIC: 1, Addr: lladdr1}})
+		_, resCh, err := ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{NIC: nicID, Addr: lladdr1}})
 		if resCh != nil {
 			if err != tcpip.ErrNoLinkAddress {
-				t.Fatalf("ep.Write(_) = _, <non-nil>, %s, want = _, <non-nil>, tcpip.ErrNoLinkAddress", err)
+				t.Fatalf("ep.Write(_) = (_, <non-nil>, %s), want = (_, <non-nil>, tcpip.ErrNoLinkAddress)", err)
 			}
 			for _, args := range []routeArgs{
 				{src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6NeighborSolicit, remoteLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.SolicitedNodeAddr(lladdr1))},
@@ -399,7 +630,7 @@ func TestLinkResolution(t *testing.T) {
 			continue
 		}
 		if err != nil {
-			t.Fatalf("ep.Write(_) = _, _, %s", err)
+			t.Fatalf("ep.Write(_) = (_, _, %s)", err)
 		}
 		break
 	}
@@ -424,6 +655,7 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 		size        int
 		extraData   []byte
 		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+		routerOnly  bool
 	}{
 		{
 			name: "DstUnreachable",
@@ -480,6 +712,8 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return stats.RouterSolicit
 			},
+			// Hosts MUST silently discard any received Router Solicitation messages.
+			routerOnly: true,
 		},
 		{
 			name: "RouterAdvert",
@@ -516,84 +750,133 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 		},
 	}
 
-	for _, typ := range types {
-		t.Run(typ.name, func(t *testing.T) {
-			e := channel.New(10, 1280, linkAddr0)
-			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
-			})
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
-			}
-
-			if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
-				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
-			}
-			{
-				subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
-				if err != nil {
-					t.Fatal(err)
-				}
-				s.SetRouteTable(
-					[]tcpip.Route{{
-						Destination: subnet,
-						NIC:         1,
-					}},
-				)
-			}
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
+	}
 
-			handleIPv6Payload := func(checksum bool) {
-				icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
-				copy(icmp[typ.size:], typ.extraData)
-				icmp.SetType(typ.typ)
-				if checksum {
-					icmp.SetChecksum(header.ICMPv6Checksum(icmp, lladdr1, lladdr0, buffer.View{}.ToVectorisedView()))
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			for _, typ := range types {
+				for _, isRouter := range []bool{false, true} {
+					name := typ.name
+					if isRouter {
+						name += " (Router)"
+					}
+					t.Run(name, func(t *testing.T) {
+						e := channel.New(0, 1280, linkAddr0)
+
+						// Indicate that resolution for link layer addresses is required to
+						// send packets over this link. This is needed so the NIC knows to
+						// allocate a neighbor table.
+						e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+
+						s := stack.New(stack.Options{
+							NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
+							UseNeighborCache: test.useNeighborCache,
+						})
+						if isRouter {
+							// Enabling forwarding makes the stack act as a router.
+							s.SetForwarding(ProtocolNumber, true)
+						}
+						if err := s.CreateNIC(nicID, e); err != nil {
+							t.Fatalf("CreateNIC(_, _) = %s", err)
+						}
+
+						if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+							t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+						}
+						{
+							subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+							if err != nil {
+								t.Fatal(err)
+							}
+							s.SetRouteTable(
+								[]tcpip.Route{{
+									Destination: subnet,
+									NIC:         nicID,
+								}},
+							)
+						}
+
+						handleIPv6Payload := func(checksum bool) {
+							icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+							copy(icmp[typ.size:], typ.extraData)
+							icmp.SetType(typ.typ)
+							if checksum {
+								icmp.SetChecksum(header.ICMPv6Checksum(icmp, lladdr1, lladdr0, buffer.View{}.ToVectorisedView()))
+							}
+							ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize))
+							ip.Encode(&header.IPv6Fields{
+								PayloadLength: uint16(len(icmp)),
+								NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+								HopLimit:      header.NDPHopLimit,
+								SrcAddr:       lladdr1,
+								DstAddr:       lladdr0,
+							})
+							pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+								Data: buffer.NewVectorisedView(len(ip)+len(icmp), []buffer.View{buffer.View(ip), buffer.View(icmp)}),
+							})
+							e.InjectInbound(ProtocolNumber, pkt)
+						}
+
+						stats := s.Stats().ICMP.V6PacketsReceived
+						invalid := stats.Invalid
+						routerOnly := stats.RouterOnlyPacketsDroppedByHost
+						typStat := typ.statCounter(stats)
+
+						// Initial stat counts should be 0.
+						if got := invalid.Value(); got != 0 {
+							t.Fatalf("got invalid = %d, want = 0", got)
+						}
+						if got := routerOnly.Value(); got != 0 {
+							t.Fatalf("got RouterOnlyPacketsReceivedByHost = %d, want = 0", got)
+						}
+						if got := typStat.Value(); got != 0 {
+							t.Fatalf("got %s = %d, want = 0", typ.name, got)
+						}
+
+						// Without setting checksum, the incoming packet should
+						// be invalid.
+						handleIPv6Payload(false)
+						if got := invalid.Value(); got != 1 {
+							t.Fatalf("got invalid = %d, want = 1", got)
+						}
+						// Router only count should not have increased.
+						if got := routerOnly.Value(); got != 0 {
+							t.Fatalf("got RouterOnlyPacketsReceivedByHost = %d, want = 0", got)
+						}
+						// Rx count of type typ.typ should not have increased.
+						if got := typStat.Value(); got != 0 {
+							t.Fatalf("got %s = %d, want = 0", typ.name, got)
+						}
+
+						// When checksum is set, it should be received.
+						handleIPv6Payload(true)
+						if got := typStat.Value(); got != 1 {
+							t.Fatalf("got %s = %d, want = 1", typ.name, got)
+						}
+						// Invalid count should not have increased again.
+						if got := invalid.Value(); got != 1 {
+							t.Fatalf("got invalid = %d, want = 1", got)
+						}
+						if !isRouter && typ.routerOnly && test.useNeighborCache {
+							// Router only count should have increased.
+							if got := routerOnly.Value(); got != 1 {
+								t.Fatalf("got RouterOnlyPacketsReceivedByHost = %d, want = 1", got)
+							}
+						}
+					})
 				}
-				ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize))
-				ip.Encode(&header.IPv6Fields{
-					PayloadLength: uint16(len(icmp)),
-					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
-					HopLimit:      header.NDPHopLimit,
-					SrcAddr:       lladdr1,
-					DstAddr:       lladdr0,
-				})
-				pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-					Data: buffer.NewVectorisedView(len(ip)+len(icmp), []buffer.View{buffer.View(ip), buffer.View(icmp)}),
-				})
-				e.InjectInbound(ProtocolNumber, pkt)
-			}
-
-			stats := s.Stats().ICMP.V6PacketsReceived
-			invalid := stats.Invalid
-			typStat := typ.statCounter(stats)
-
-			// Initial stat counts should be 0.
-			if got := invalid.Value(); got != 0 {
-				t.Fatalf("got invalid = %d, want = 0", got)
-			}
-			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
-			}
-
-			// Without setting checksum, the incoming packet should
-			// be invalid.
-			handleIPv6Payload(false)
-			if got := invalid.Value(); got != 1 {
-				t.Fatalf("got invalid = %d, want = 1", got)
-			}
-			// Rx count of type typ.typ should not have increased.
-			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
-			}
-
-			// When checksum is set, it should be received.
-			handleIPv6Payload(true)
-			if got := typStat.Value(); got != 1 {
-				t.Fatalf("got %s = %d, want = 1", typ.name, got)
-			}
-			// Invalid count should not have increased again.
-			if got := invalid.Value(); got != 1 {
-				t.Fatalf("got invalid = %d, want = 1", got)
 			}
 		})
 	}
@@ -694,13 +977,13 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 		t.Run(typ.name, func(t *testing.T) {
 			e := channel.New(10, 1280, linkAddr0)
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
 			})
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(_, _) = %s", err)
 			}
 
-			if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+			if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
 				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
 			}
 			{
@@ -711,7 +994,7 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 				s.SetRouteTable(
 					[]tcpip.Route{{
 						Destination: subnet,
-						NIC:         1,
+						NIC:         nicID,
 					}},
 				)
 			}
@@ -750,7 +1033,7 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+				t.Fatalf("got = %d, want = 0", got)
 			}
 
 			// Without setting checksum, the incoming packet should
@@ -761,13 +1044,13 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 			}
 			// Rx count of type typ.typ should not have increased.
 			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+				t.Fatalf("got = %d, want = 0", got)
 			}
 
 			// When checksum is set, it should be received.
 			handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, true)
 			if got := typStat.Value(); got != 1 {
-				t.Fatalf("got %s = %d, want = 1", typ.name, got)
+				t.Fatalf("got = %d, want = 0", got)
 			}
 			// Invalid count should not have increased again.
 			if got := invalid.Value(); got != 1 {
@@ -872,14 +1155,14 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 		t.Run(typ.name, func(t *testing.T) {
 			e := channel.New(10, 1280, linkAddr0)
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
 			})
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
 
-			if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
-				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+			if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
 			}
 			{
 				subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
@@ -889,7 +1172,7 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 				s.SetRouteTable(
 					[]tcpip.Route{{
 						Destination: subnet,
-						NIC:         1,
+						NIC:         nicID,
 					}},
 				)
 			}
@@ -929,7 +1212,7 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+				t.Fatalf("got = %d, want = 0", got)
 			}
 
 			// Without setting checksum, the incoming packet should
@@ -940,13 +1223,13 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 			}
 			// Rx count of type typ.typ should not have increased.
 			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+				t.Fatalf("got = %d, want = 0", got)
 			}
 
 			// When checksum is set, it should be received.
 			handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, true)
 			if got := typStat.Value(); got != 1 {
-				t.Fatalf("got %s = %d, want = 1", typ.name, got)
+				t.Fatalf("got = %d, want = 0", got)
 			}
 			// Invalid count should not have increased again.
 			if got := invalid.Value(); got != 1 {
@@ -957,45 +1240,571 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 }
 
 func TestLinkAddressRequest(t *testing.T) {
+	const nicID = 1
+
 	snaddr := header.SolicitedNodeAddr(lladdr0)
 	mcaddr := header.EthernetAddressFromMulticastIPv6Address(snaddr)
 
 	tests := []struct {
 		name           string
+		nicAddr        tcpip.Address
+		localAddr      tcpip.Address
 		remoteLinkAddr tcpip.LinkAddress
-		expectLinkAddr tcpip.LinkAddress
+
+		expectedErr            *tcpip.Error
+		expectedRemoteAddr     tcpip.Address
+		expectedRemoteLinkAddr tcpip.LinkAddress
 	}{
 		{
-			name:           "Unicast",
+			name:                   "Unicast",
+			nicAddr:                lladdr1,
+			localAddr:              lladdr1,
+			remoteLinkAddr:         linkAddr1,
+			expectedRemoteAddr:     lladdr0,
+			expectedRemoteLinkAddr: linkAddr1,
+		},
+		{
+			name:                   "Multicast",
+			nicAddr:                lladdr1,
+			localAddr:              lladdr1,
+			remoteLinkAddr:         "",
+			expectedRemoteAddr:     snaddr,
+			expectedRemoteLinkAddr: mcaddr,
+		},
+		{
+			name:                   "Unicast with unspecified source",
+			nicAddr:                lladdr1,
+			remoteLinkAddr:         linkAddr1,
+			expectedRemoteAddr:     lladdr0,
+			expectedRemoteLinkAddr: linkAddr1,
+		},
+		{
+			name:                   "Multicast with unspecified source",
+			nicAddr:                lladdr1,
+			remoteLinkAddr:         "",
+			expectedRemoteAddr:     snaddr,
+			expectedRemoteLinkAddr: mcaddr,
+		},
+		{
+			name:           "Unicast with unassigned address",
+			localAddr:      lladdr1,
+			remoteLinkAddr: linkAddr1,
+			expectedErr:    tcpip.ErrNetworkUnreachable,
+		},
+		{
+			name:           "Multicast with unassigned address",
+			localAddr:      lladdr1,
+			remoteLinkAddr: "",
+			expectedErr:    tcpip.ErrNetworkUnreachable,
+		},
+		{
+			name:           "Unicast with no local address available",
 			remoteLinkAddr: linkAddr1,
-			expectLinkAddr: linkAddr1,
+			expectedErr:    tcpip.ErrNetworkUnreachable,
 		},
 		{
-			name:           "Multicast",
+			name:           "Multicast with no local address available",
 			remoteLinkAddr: "",
-			expectLinkAddr: mcaddr,
+			expectedErr:    tcpip.ErrNetworkUnreachable,
 		},
 	}
 
 	for _, test := range tests {
-		p := NewProtocol()
+		s := stack.New(stack.Options{
+			NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
+		})
+		p := s.NetworkProtocolInstance(ProtocolNumber)
 		linkRes, ok := p.(stack.LinkAddressResolver)
 		if !ok {
 			t.Fatalf("expected IPv6 protocol to implement stack.LinkAddressResolver")
 		}
 
 		linkEP := channel.New(defaultChannelSize, defaultMTU, linkAddr0)
-		if err := linkRes.LinkAddressRequest(lladdr0, lladdr1, test.remoteLinkAddr, linkEP); err != nil {
-			t.Errorf("got p.LinkAddressRequest(%s, %s, %s, _) = %s", lladdr0, lladdr1, test.remoteLinkAddr, err)
+		if err := s.CreateNIC(nicID, linkEP); err != nil {
+			t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err)
+		}
+		if len(test.nicAddr) != 0 {
+			if err := s.AddAddress(nicID, ProtocolNumber, test.nicAddr); err != nil {
+				t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, ProtocolNumber, test.nicAddr, err)
+			}
+		}
+
+		// We pass a test network interface to LinkAddressRequest with the same NIC
+		// ID and link endpoint used by the NIC we created earlier so that we can
+		// mock a link address request and observe the packets sent to the link
+		// endpoint even though the stack uses the real NIC.
+		if err := linkRes.LinkAddressRequest(lladdr0, test.localAddr, test.remoteLinkAddr, &testInterface{LinkEndpoint: linkEP, nicID: nicID}); err != test.expectedErr {
+			t.Errorf("got p.LinkAddressRequest(%s, %s, %s, _) = %s, want = %s", lladdr0, test.localAddr, test.remoteLinkAddr, err, test.expectedErr)
+		}
+
+		if test.expectedErr != nil {
+			return
 		}
 
 		pkt, ok := linkEP.Read()
 		if !ok {
 			t.Fatal("expected to send a link address request")
 		}
+		if pkt.Route.RemoteLinkAddress != test.expectedRemoteLinkAddr {
+			t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", pkt.Route.RemoteLinkAddress, test.expectedRemoteLinkAddr)
+		}
+		if pkt.Route.RemoteAddress != test.expectedRemoteAddr {
+			t.Errorf("got pkt.Route.RemoteAddress = %s, want = %s", pkt.Route.RemoteAddress, test.expectedRemoteAddr)
+		}
+		if pkt.Route.LocalAddress != lladdr1 {
+			t.Errorf("got pkt.Route.LocalAddress = %s, want = %s", pkt.Route.LocalAddress, lladdr1)
+		}
+		checker.IPv6(t, stack.PayloadSince(pkt.Pkt.NetworkHeader()),
+			checker.SrcAddr(lladdr1),
+			checker.DstAddr(test.expectedRemoteAddr),
+			checker.TTL(header.NDPHopLimit),
+			checker.NDPNS(
+				checker.NDPNSTargetAddress(lladdr0),
+				checker.NDPNSOptions([]header.NDPOption{header.NDPSourceLinkLayerAddressOption(linkAddr0)}),
+			))
+	}
+}
 
-		if got, want := pkt.Route.RemoteLinkAddress, test.expectLinkAddr; got != want {
-			t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", got, want)
+func TestPacketQueing(t *testing.T) {
+	const nicID = 1
+
+	var (
+		host1NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x06")
+		host2NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x09")
+
+		host1IPv6Addr = tcpip.ProtocolAddress{
+			Protocol: ProtocolNumber,
+			AddressWithPrefix: tcpip.AddressWithPrefix{
+				Address:   tcpip.Address(net.ParseIP("a::1").To16()),
+				PrefixLen: 64,
+			},
+		}
+		host2IPv6Addr = tcpip.ProtocolAddress{
+			Protocol: ProtocolNumber,
+			AddressWithPrefix: tcpip.AddressWithPrefix{
+				Address:   tcpip.Address(net.ParseIP("a::2").To16()),
+				PrefixLen: 64,
+			},
 		}
+	)
+
+	tests := []struct {
+		name      string
+		rxPkt     func(*channel.Endpoint)
+		checkResp func(*testing.T, *channel.Endpoint)
+	}{
+		{
+			name: "ICMP Error",
+			rxPkt: func(e *channel.Endpoint) {
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.UDPMinimumSize)
+				u := header.UDP(hdr.Prepend(header.UDPMinimumSize))
+				u.Encode(&header.UDPFields{
+					SrcPort: 5555,
+					DstPort: 80,
+					Length:  header.UDPMinimumSize,
+				})
+				sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, host2IPv6Addr.AddressWithPrefix.Address, host1IPv6Addr.AddressWithPrefix.Address, header.UDPMinimumSize)
+				sum = header.Checksum(header.UDP([]byte{}), sum)
+				u.SetChecksum(^u.CalculateChecksum(sum))
+				payloadLength := hdr.UsedLength()
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(payloadLength),
+					NextHeader:    uint8(udp.ProtocolNumber),
+					HopLimit:      DefaultTTL,
+					SrcAddr:       host2IPv6Addr.AddressWithPrefix.Address,
+					DstAddr:       host1IPv6Addr.AddressWithPrefix.Address,
+				})
+				e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+					Data: hdr.View().ToVectorisedView(),
+				}))
+			},
+			checkResp: func(t *testing.T, e *channel.Endpoint) {
+				p, ok := e.ReadContext(context.Background())
+				if !ok {
+					t.Fatalf("timed out waiting for packet")
+				}
+				if p.Proto != ProtocolNumber {
+					t.Errorf("got p.Proto = %d, want = %d", p.Proto, ProtocolNumber)
+				}
+				if p.Route.RemoteLinkAddress != host2NICLinkAddr {
+					t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr)
+				}
+				checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
+					checker.SrcAddr(host1IPv6Addr.AddressWithPrefix.Address),
+					checker.DstAddr(host2IPv6Addr.AddressWithPrefix.Address),
+					checker.ICMPv6(
+						checker.ICMPv6Type(header.ICMPv6DstUnreachable),
+						checker.ICMPv6Code(header.ICMPv6PortUnreachable)))
+			},
+		},
+
+		{
+			name: "Ping",
+			rxPkt: func(e *channel.Endpoint) {
+				totalLen := header.IPv6MinimumSize + header.ICMPv6MinimumSize
+				hdr := buffer.NewPrependable(totalLen)
+				pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6MinimumSize))
+				pkt.SetType(header.ICMPv6EchoRequest)
+				pkt.SetCode(0)
+				pkt.SetChecksum(0)
+				pkt.SetChecksum(header.ICMPv6Checksum(pkt, host2IPv6Addr.AddressWithPrefix.Address, host1IPv6Addr.AddressWithPrefix.Address, buffer.VectorisedView{}))
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: header.ICMPv6MinimumSize,
+					NextHeader:    uint8(icmp.ProtocolNumber6),
+					HopLimit:      DefaultTTL,
+					SrcAddr:       host2IPv6Addr.AddressWithPrefix.Address,
+					DstAddr:       host1IPv6Addr.AddressWithPrefix.Address,
+				})
+				e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+					Data: hdr.View().ToVectorisedView(),
+				}))
+			},
+			checkResp: func(t *testing.T, e *channel.Endpoint) {
+				p, ok := e.ReadContext(context.Background())
+				if !ok {
+					t.Fatalf("timed out waiting for packet")
+				}
+				if p.Proto != ProtocolNumber {
+					t.Errorf("got p.Proto = %d, want = %d", p.Proto, ProtocolNumber)
+				}
+				if p.Route.RemoteLinkAddress != host2NICLinkAddr {
+					t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, host2NICLinkAddr)
+				}
+				checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
+					checker.SrcAddr(host1IPv6Addr.AddressWithPrefix.Address),
+					checker.DstAddr(host2IPv6Addr.AddressWithPrefix.Address),
+					checker.ICMPv6(
+						checker.ICMPv6Type(header.ICMPv6EchoReply),
+						checker.ICMPv6Code(header.ICMPv6UnusedCode)))
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+
+			e := channel.New(1, header.IPv6MinimumMTU, host1NICLinkAddr)
+			e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocolFactory{NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
+			})
+
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err)
+			}
+			if err := s.AddProtocolAddress(nicID, host1IPv6Addr); err != nil {
+				t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", nicID, host1IPv6Addr, err)
+			}
+
+			s.SetRouteTable([]tcpip.Route{
+				tcpip.Route{
+					Destination: host1IPv6Addr.AddressWithPrefix.Subnet(),
+					NIC:         nicID,
+				},
+			})
+
+			// Receive a packet to trigger link resolution before a response is sent.
+			test.rxPkt(e)
+
+			// Wait for a neighbor solicitation since link address resolution should
+			// be performed.
+			{
+				p, ok := e.ReadContext(context.Background())
+				if !ok {
+					t.Fatalf("timed out waiting for packet")
+				}
+				if p.Proto != ProtocolNumber {
+					t.Errorf("got Proto = %d, want = %d", p.Proto, ProtocolNumber)
+				}
+				snmc := header.SolicitedNodeAddr(host2IPv6Addr.AddressWithPrefix.Address)
+				if want := header.EthernetAddressFromMulticastIPv6Address(snmc); p.Route.RemoteLinkAddress != want {
+					t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, want)
+				}
+				checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
+					checker.SrcAddr(host1IPv6Addr.AddressWithPrefix.Address),
+					checker.DstAddr(snmc),
+					checker.TTL(header.NDPHopLimit),
+					checker.NDPNS(
+						checker.NDPNSTargetAddress(host2IPv6Addr.AddressWithPrefix.Address),
+						checker.NDPNSOptions([]header.NDPOption{header.NDPSourceLinkLayerAddressOption(host1NICLinkAddr)}),
+					))
+			}
+
+			// Send a neighbor advertisement to complete link address resolution.
+			{
+				naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + naSize)
+				pkt := header.ICMPv6(hdr.Prepend(naSize))
+				pkt.SetType(header.ICMPv6NeighborAdvert)
+				na := header.NDPNeighborAdvert(pkt.NDPPayload())
+				na.SetSolicitedFlag(true)
+				na.SetOverrideFlag(true)
+				na.SetTargetAddress(host2IPv6Addr.AddressWithPrefix.Address)
+				na.Options().Serialize(header.NDPOptionsSerializer{
+					header.NDPTargetLinkLayerAddressOption(host2NICLinkAddr),
+				})
+				pkt.SetChecksum(header.ICMPv6Checksum(pkt, host2IPv6Addr.AddressWithPrefix.Address, host1IPv6Addr.AddressWithPrefix.Address, buffer.VectorisedView{}))
+				payloadLength := hdr.UsedLength()
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(payloadLength),
+					NextHeader:    uint8(icmp.ProtocolNumber6),
+					HopLimit:      header.NDPHopLimit,
+					SrcAddr:       host2IPv6Addr.AddressWithPrefix.Address,
+					DstAddr:       host1IPv6Addr.AddressWithPrefix.Address,
+				})
+				e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+					Data: hdr.View().ToVectorisedView(),
+				}))
+			}
+
+			// Expect the response now that the link address has resolved.
+			test.checkResp(t, e)
+
+			// Since link resolution was already performed, it shouldn't be performed
+			// again.
+			test.rxPkt(e)
+			test.checkResp(t, e)
+		})
+	}
+}
+
+func TestCallsToNeighborCache(t *testing.T) {
+	tests := []struct {
+		name                  string
+		createPacket          func() header.ICMPv6
+		multicast             bool
+		source                tcpip.Address
+		destination           tcpip.Address
+		wantProbeCount        int
+		wantConfirmationCount int
+	}{
+		{
+			name: "Unicast Neighbor Solicitation without source link-layer address option",
+			createPacket: func() header.ICMPv6 {
+				nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
+				icmp := header.ICMPv6(buffer.NewView(nsSize))
+				icmp.SetType(header.ICMPv6NeighborSolicit)
+				ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+				ns.SetTargetAddress(lladdr0)
+				return icmp
+			},
+			source:      lladdr1,
+			destination: lladdr0,
+			// "The source link-layer address option SHOULD be included in unicast
+			//  solicitations." - RFC 4861 section 4.3
+			//
+			// A Neighbor Advertisement needs to be sent in response, but the
+			// Neighbor Cache shouldn't be updated since we have no useful
+			// information about the sender.
+			wantProbeCount: 0,
+		},
+		{
+			name: "Unicast Neighbor Solicitation with source link-layer address option",
+			createPacket: func() header.ICMPv6 {
+				nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
+				icmp := header.ICMPv6(buffer.NewView(nsSize))
+				icmp.SetType(header.ICMPv6NeighborSolicit)
+				ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+				ns.SetTargetAddress(lladdr0)
+				ns.Options().Serialize(header.NDPOptionsSerializer{
+					header.NDPSourceLinkLayerAddressOption(linkAddr1),
+				})
+				return icmp
+			},
+			source:         lladdr1,
+			destination:    lladdr0,
+			wantProbeCount: 1,
+		},
+		{
+			name: "Multicast Neighbor Solicitation without source link-layer address option",
+			createPacket: func() header.ICMPv6 {
+				nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
+				icmp := header.ICMPv6(buffer.NewView(nsSize))
+				icmp.SetType(header.ICMPv6NeighborSolicit)
+				ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+				ns.SetTargetAddress(lladdr0)
+				return icmp
+			},
+			source:      lladdr1,
+			destination: header.SolicitedNodeAddr(lladdr0),
+			// "The source link-layer address option MUST be included in multicast
+			//  solicitations." - RFC 4861 section 4.3
+			wantProbeCount: 0,
+		},
+		{
+			name: "Multicast Neighbor Solicitation with source link-layer address option",
+			createPacket: func() header.ICMPv6 {
+				nsSize := header.ICMPv6NeighborSolicitMinimumSize + header.NDPLinkLayerAddressSize
+				icmp := header.ICMPv6(buffer.NewView(nsSize))
+				icmp.SetType(header.ICMPv6NeighborSolicit)
+				ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+				ns.SetTargetAddress(lladdr0)
+				ns.Options().Serialize(header.NDPOptionsSerializer{
+					header.NDPSourceLinkLayerAddressOption(linkAddr1),
+				})
+				return icmp
+			},
+			source:         lladdr1,
+			destination:    header.SolicitedNodeAddr(lladdr0),
+			wantProbeCount: 1,
+		},
+		{
+			name: "Unicast Neighbor Advertisement without target link-layer address option",
+			createPacket: func() header.ICMPv6 {
+				naSize := header.ICMPv6NeighborAdvertMinimumSize
+				icmp := header.ICMPv6(buffer.NewView(naSize))
+				icmp.SetType(header.ICMPv6NeighborAdvert)
+				na := header.NDPNeighborAdvert(icmp.NDPPayload())
+				na.SetSolicitedFlag(true)
+				na.SetOverrideFlag(false)
+				na.SetTargetAddress(lladdr1)
+				return icmp
+			},
+			source:      lladdr1,
+			destination: lladdr0,
+			// "When responding to unicast solicitations, the target link-layer
+			//  address option can be omitted since the sender of the solicitation has
+			//  the correct link-layer address; otherwise, it would not be able to
+			//  send the unicast solicitation in the first place."
+			//   - RFC 4861 section 4.4
+			wantConfirmationCount: 1,
+		},
+		{
+			name: "Unicast Neighbor Advertisement with target link-layer address option",
+			createPacket: func() header.ICMPv6 {
+				naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+				icmp := header.ICMPv6(buffer.NewView(naSize))
+				icmp.SetType(header.ICMPv6NeighborAdvert)
+				na := header.NDPNeighborAdvert(icmp.NDPPayload())
+				na.SetSolicitedFlag(true)
+				na.SetOverrideFlag(false)
+				na.SetTargetAddress(lladdr1)
+				na.Options().Serialize(header.NDPOptionsSerializer{
+					header.NDPTargetLinkLayerAddressOption(linkAddr1),
+				})
+				return icmp
+			},
+			source:                lladdr1,
+			destination:           lladdr0,
+			wantConfirmationCount: 1,
+		},
+		{
+			name: "Multicast Neighbor Advertisement without target link-layer address option",
+			createPacket: func() header.ICMPv6 {
+				naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+				icmp := header.ICMPv6(buffer.NewView(naSize))
+				icmp.SetType(header.ICMPv6NeighborAdvert)
+				na := header.NDPNeighborAdvert(icmp.NDPPayload())
+				na.SetSolicitedFlag(false)
+				na.SetOverrideFlag(false)
+				na.SetTargetAddress(lladdr1)
+				return icmp
+			},
+			source:      lladdr1,
+			destination: header.IPv6AllNodesMulticastAddress,
+			// "Target link-layer address MUST be included for multicast solicitations
+			//  in order to avoid infinite Neighbor Solicitation "recursion" when the
+			//  peer node does not have a cache entry to return a Neighbor
+			//  Advertisements message." - RFC 4861 section 4.4
+			wantConfirmationCount: 0,
+		},
+		{
+			name: "Multicast Neighbor Advertisement with target link-layer address option",
+			createPacket: func() header.ICMPv6 {
+				naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+				icmp := header.ICMPv6(buffer.NewView(naSize))
+				icmp.SetType(header.ICMPv6NeighborAdvert)
+				na := header.NDPNeighborAdvert(icmp.NDPPayload())
+				na.SetSolicitedFlag(false)
+				na.SetOverrideFlag(false)
+				na.SetTargetAddress(lladdr1)
+				na.Options().Serialize(header.NDPOptionsSerializer{
+					header.NDPTargetLinkLayerAddressOption(linkAddr1),
+				})
+				return icmp
+			},
+			source:                lladdr1,
+			destination:           header.IPv6AllNodesMulticastAddress,
+			wantConfirmationCount: 1,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocolFactory{NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6},
+				UseNeighborCache:   true,
+			})
+			{
+				if err := s.CreateNIC(nicID, &stubLinkEndpoint{}); err != nil {
+					t.Fatalf("CreateNIC(_, _) = %s", err)
+				}
+				if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+					t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+				}
+			}
+			{
+				subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+				if err != nil {
+					t.Fatal(err)
+				}
+				s.SetRouteTable(
+					[]tcpip.Route{{
+						Destination: subnet,
+						NIC:         nicID,
+					}},
+				)
+			}
+
+			netProto := s.NetworkProtocolInstance(ProtocolNumber)
+			if netProto == nil {
+				t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
+			}
+			nudHandler := &stubNUDHandler{}
+			ep := netProto.NewEndpoint(&testInterface{LinkEndpoint: channel.New(0, header.IPv6MinimumMTU, linkAddr0)}, &stubLinkAddressCache{}, nudHandler, &stubDispatcher{})
+			defer ep.Close()
+
+			if err := ep.Enable(); err != nil {
+				t.Fatalf("ep.Enable(): %s", err)
+			}
+
+			r, err := s.FindRoute(nicID, lladdr0, test.source, ProtocolNumber, false /* multicastLoop */)
+			if err != nil {
+				t.Fatalf("FindRoute(%d, %s, %s, _, false) = (_, %s), want = (_, nil)", nicID, lladdr0, lladdr1, err)
+			}
+			defer r.Release()
+
+			// TODO(gvisor.dev/issue/4517): Remove the need for this manual patch.
+			r.LocalAddress = test.destination
+
+			icmp := test.createPacket()
+			icmp.SetChecksum(header.ICMPv6Checksum(icmp, r.RemoteAddress, r.LocalAddress, buffer.VectorisedView{}))
+			pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+				ReserveHeaderBytes: header.IPv6MinimumSize,
+				Data:               buffer.View(icmp).ToVectorisedView(),
+			})
+			ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(len(icmp)),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      header.NDPHopLimit,
+				SrcAddr:       r.RemoteAddress,
+				DstAddr:       r.LocalAddress,
+			})
+			ep.HandlePacket(&r, pkt)
+
+			// Confirm the endpoint calls the correct NUDHandler method.
+			if nudHandler.probeCount != test.wantProbeCount {
+				t.Errorf("got nudHandler.probeCount = %d, want = %d", nudHandler.probeCount, test.wantProbeCount)
+			}
+			if nudHandler.confirmationCount != test.wantConfirmationCount {
+				t.Errorf("got nudHandler.confirmationCount = %d, want = %d", nudHandler.confirmationCount, test.wantConfirmationCount)
+			}
+		})
 	}
 }
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 0eafe9790..1e38f3a9d 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,45 +12,347 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package ipv6 contains the implementation of the ipv6 network protocol. To use
-// it in the networking stack, this package must be added to the project, and
-// activated on the stack by passing ipv6.NewProtocol() as one of the network
-// protocols when calling stack.New(). Then endpoints can be created by passing
-// ipv6.ProtocolNumber as the network protocol number when calling
-// Stack.NewEndpoint().
+// Package ipv6 contains the implementation of the ipv6 network protocol.
 package ipv6
 
 import (
+	"encoding/binary"
 	"fmt"
+	"hash/fnv"
+	"sort"
 	"sync/atomic"
+	"time"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/header/parse"
 	"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
+	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
 const (
+	// As per RFC 8200 section 4.5:
+	//   If insufficient fragments are received to complete reassembly of a packet
+	//   within 60 seconds of the reception of the first-arriving fragment of that
+	//   packet, reassembly of that packet must be abandoned.
+	//
+	// Linux also uses 60 seconds for reassembly timeout:
+	// https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ipv6.h#L456
+	ReassembleTimeout = 60 * time.Second
+
 	// ProtocolNumber is the ipv6 protocol number.
 	ProtocolNumber = header.IPv6ProtocolNumber
 
-	// maxTotalSize is maximum size that can be encoded in the 16-bit
+	// maxPayloadSize is the maximum size that can be encoded in the 16-bit
 	// PayloadLength field of the ipv6 header.
 	maxPayloadSize = 0xffff
 
 	// DefaultTTL is the default hop limit for IPv6 Packets egressed by
 	// Netstack.
 	DefaultTTL = 64
+
+	// buckets for fragment identifiers
+	buckets = 2048
 )
 
+var _ stack.GroupAddressableEndpoint = (*endpoint)(nil)
+var _ stack.AddressableEndpoint = (*endpoint)(nil)
+var _ stack.NetworkEndpoint = (*endpoint)(nil)
+var _ stack.NDPEndpoint = (*endpoint)(nil)
+var _ NDPEndpoint = (*endpoint)(nil)
+
 type endpoint struct {
-	nicID         tcpip.NICID
-	linkEP        stack.LinkEndpoint
+	nic           stack.NetworkInterface
 	linkAddrCache stack.LinkAddressCache
+	nud           stack.NUDHandler
 	dispatcher    stack.TransportDispatcher
 	protocol      *protocol
 	stack         *stack.Stack
+
+	// enabled is set to 1 when the endpoint is enabled and 0 when it is
+	// disabled.
+	//
+	// Must be accessed using atomic operations.
+	enabled uint32
+
+	mu struct {
+		sync.RWMutex
+
+		addressableEndpointState stack.AddressableEndpointState
+		ndp                      ndpState
+	}
+}
+
+// NICNameFromID is a function that returns a stable name for the specified NIC,
+// even if different NIC IDs are used to refer to the same NIC in different
+// program runs. It is used when generating opaque interface identifiers (IIDs).
+// If the NIC was created with a name, it is passed to NICNameFromID.
+//
+// NICNameFromID SHOULD return unique NIC names so unique opaque IIDs are
+// generated for the same prefix on differnt NICs.
+type NICNameFromID func(tcpip.NICID, string) string
+
+// OpaqueInterfaceIdentifierOptions holds the options related to the generation
+// of opaque interface indentifiers (IIDs) as defined by RFC 7217.
+type OpaqueInterfaceIdentifierOptions struct {
+	// NICNameFromID is a function that returns a stable name for a specified NIC,
+	// even if the NIC ID changes over time.
+	//
+	// Must be specified to generate the opaque IID.
+	NICNameFromID NICNameFromID
+
+	// SecretKey is a pseudo-random number used as the secret key when generating
+	// opaque IIDs as defined by RFC 7217. The key SHOULD be at least
+	// header.OpaqueIIDSecretKeyMinBytes bytes and MUST follow minimum randomness
+	// requirements for security as outlined by RFC 4086. SecretKey MUST NOT
+	// change between program runs, unless explicitly changed.
+	//
+	// OpaqueInterfaceIdentifierOptions takes ownership of SecretKey. SecretKey
+	// MUST NOT be modified after Stack is created.
+	//
+	// May be nil, but a nil value is highly discouraged to maintain
+	// some level of randomness between nodes.
+	SecretKey []byte
+}
+
+// InvalidateDefaultRouter implements stack.NDPEndpoint.
+func (e *endpoint) InvalidateDefaultRouter(rtr tcpip.Address) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	e.mu.ndp.invalidateDefaultRouter(rtr)
+}
+
+// SetNDPConfigurations implements NDPEndpoint.
+func (e *endpoint) SetNDPConfigurations(c NDPConfigurations) {
+	c.validate()
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	e.mu.ndp.configs = c
+}
+
+// hasTentativeAddr returns true if addr is tentative on e.
+func (e *endpoint) hasTentativeAddr(addr tcpip.Address) bool {
+	e.mu.RLock()
+	addressEndpoint := e.getAddressRLocked(addr)
+	e.mu.RUnlock()
+	return addressEndpoint != nil && addressEndpoint.GetKind() == stack.PermanentTentative
+}
+
+// dupTentativeAddrDetected attempts to inform e that a tentative addr is a
+// duplicate on a link.
+//
+// dupTentativeAddrDetected removes the tentative address if it exists. If the
+// address was generated via SLAAC, an attempt is made to generate a new
+// address.
+func (e *endpoint) dupTentativeAddrDetected(addr tcpip.Address) *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	addressEndpoint := e.getAddressRLocked(addr)
+	if addressEndpoint == nil {
+		return tcpip.ErrBadAddress
+	}
+
+	if addressEndpoint.GetKind() != stack.PermanentTentative {
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	// If the address is a SLAAC address, do not invalidate its SLAAC prefix as an
+	// attempt will be made to generate a new address for it.
+	if err := e.removePermanentEndpointLocked(addressEndpoint, false /* allowSLAACInvalidation */); err != nil {
+		return err
+	}
+
+	prefix := addressEndpoint.AddressWithPrefix().Subnet()
+
+	switch t := addressEndpoint.ConfigType(); t {
+	case stack.AddressConfigStatic:
+	case stack.AddressConfigSlaac:
+		e.mu.ndp.regenerateSLAACAddr(prefix)
+	case stack.AddressConfigSlaacTemp:
+		// Do not reset the generation attempts counter for the prefix as the
+		// temporary address is being regenerated in response to a DAD conflict.
+		e.mu.ndp.regenerateTempSLAACAddr(prefix, false /* resetGenAttempts */)
+	default:
+		panic(fmt.Sprintf("unrecognized address config type = %d", t))
+	}
+
+	return nil
+}
+
+// transitionForwarding transitions the endpoint's forwarding status to
+// forwarding.
+//
+// Must only be called when the forwarding status changes.
+func (e *endpoint) transitionForwarding(forwarding bool) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	if !e.Enabled() {
+		return
+	}
+
+	if forwarding {
+		// When transitioning into an IPv6 router, host-only state (NDP discovered
+		// routers, discovered on-link prefixes, and auto-generated addresses) is
+		// cleaned up/invalidated and NDP router solicitations are stopped.
+		e.mu.ndp.stopSolicitingRouters()
+		e.mu.ndp.cleanupState(true /* hostOnly */)
+	} else {
+		// When transitioning into an IPv6 host, NDP router solicitations are
+		// started.
+		e.mu.ndp.startSolicitingRouters()
+	}
+}
+
+// Enable implements stack.NetworkEndpoint.
+func (e *endpoint) Enable() *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	// If the NIC is not enabled, the endpoint can't do anything meaningful so
+	// don't enable the endpoint.
+	if !e.nic.Enabled() {
+		return tcpip.ErrNotPermitted
+	}
+
+	// If the endpoint is already enabled, there is nothing for it to do.
+	if !e.setEnabled(true) {
+		return nil
+	}
+
+	// Join the IPv6 All-Nodes Multicast group if the stack is configured to
+	// use IPv6. This is required to ensure that this node properly receives
+	// and responds to the various NDP messages that are destined to the
+	// all-nodes multicast address. An example is the Neighbor Advertisement
+	// when we perform Duplicate Address Detection, or Router Advertisement
+	// when we do Router Discovery. See RFC 4862, section 5.4.2 and RFC 4861
+	// section 4.2 for more information.
+	//
+	// Also auto-generate an IPv6 link-local address based on the endpoint's
+	// link address if it is configured to do so. Note, each interface is
+	// required to have IPv6 link-local unicast address, as per RFC 4291
+	// section 2.1.
+
+	// Join the All-Nodes multicast group before starting DAD as responses to DAD
+	// (NDP NS) messages may be sent to the All-Nodes multicast group if the
+	// source address of the NDP NS is the unspecified address, as per RFC 4861
+	// section 7.2.4.
+	if _, err := e.mu.addressableEndpointState.JoinGroup(header.IPv6AllNodesMulticastAddress); err != nil {
+		return err
+	}
+
+	// Perform DAD on the all the unicast IPv6 endpoints that are in the permanent
+	// state.
+	//
+	// Addresses may have aleady completed DAD but in the time since the endpoint
+	// was last enabled, other devices may have acquired the same addresses.
+	var err *tcpip.Error
+	e.mu.addressableEndpointState.ReadOnly().ForEach(func(addressEndpoint stack.AddressEndpoint) bool {
+		addr := addressEndpoint.AddressWithPrefix().Address
+		if !header.IsV6UnicastAddress(addr) {
+			return true
+		}
+
+		switch addressEndpoint.GetKind() {
+		case stack.Permanent:
+			addressEndpoint.SetKind(stack.PermanentTentative)
+			fallthrough
+		case stack.PermanentTentative:
+			err = e.mu.ndp.startDuplicateAddressDetection(addr, addressEndpoint)
+			return err == nil
+		default:
+			return true
+		}
+	})
+	if err != nil {
+		return err
+	}
+
+	// Do not auto-generate an IPv6 link-local address for loopback devices.
+	if e.protocol.autoGenIPv6LinkLocal && !e.nic.IsLoopback() {
+		// The valid and preferred lifetime is infinite for the auto-generated
+		// link-local address.
+		e.mu.ndp.doSLAAC(header.IPv6LinkLocalPrefix.Subnet(), header.NDPInfiniteLifetime, header.NDPInfiniteLifetime)
+	}
+
+	// If we are operating as a router, then do not solicit routers since we
+	// won't process the RAs anyway.
+	//
+	// Routers do not process Router Advertisements (RA) the same way a host
+	// does. That is, routers do not learn from RAs (e.g. on-link prefixes
+	// and default routers). Therefore, soliciting RAs from other routers on
+	// a link is unnecessary for routers.
+	if !e.protocol.Forwarding() {
+		e.mu.ndp.startSolicitingRouters()
+	}
+
+	return nil
+}
+
+// Enabled implements stack.NetworkEndpoint.
+func (e *endpoint) Enabled() bool {
+	return e.nic.Enabled() && e.isEnabled()
+}
+
+// isEnabled returns true if the endpoint is enabled, regardless of the
+// enabled status of the NIC.
+func (e *endpoint) isEnabled() bool {
+	return atomic.LoadUint32(&e.enabled) == 1
+}
+
+// setEnabled sets the enabled status for the endpoint.
+//
+// Returns true if the enabled status was updated.
+func (e *endpoint) setEnabled(v bool) bool {
+	if v {
+		return atomic.SwapUint32(&e.enabled, 1) == 0
+	}
+	return atomic.SwapUint32(&e.enabled, 0) == 1
+}
+
+// Disable implements stack.NetworkEndpoint.
+func (e *endpoint) Disable() {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	e.disableLocked()
+}
+
+func (e *endpoint) disableLocked() {
+	if !e.setEnabled(false) {
+		return
+	}
+
+	e.mu.ndp.stopSolicitingRouters()
+	e.mu.ndp.cleanupState(false /* hostOnly */)
+	e.stopDADForPermanentAddressesLocked()
+
+	// The endpoint may have already left the multicast group.
+	if _, err := e.mu.addressableEndpointState.LeaveGroup(header.IPv6AllNodesMulticastAddress); err != nil && err != tcpip.ErrBadLocalAddress {
+		panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv6AllNodesMulticastAddress, err))
+	}
+}
+
+// stopDADForPermanentAddressesLocked stops DAD for all permaneent addresses.
+//
+// Precondition: e.mu must be write locked.
+func (e *endpoint) stopDADForPermanentAddressesLocked() {
+	// Stop DAD for all the tentative unicast addresses.
+	e.mu.addressableEndpointState.ReadOnly().ForEach(func(addressEndpoint stack.AddressEndpoint) bool {
+		if addressEndpoint.GetKind() != stack.PermanentTentative {
+			return true
+		}
+
+		addr := addressEndpoint.AddressWithPrefix().Address
+		if header.IsV6UnicastAddress(addr) {
+			e.mu.ndp.stopDuplicateAddressDetection(addr)
+		}
+
+		return true
+	})
 }
 
 // DefaultTTL is the default hop limit for this endpoint.
@@ -61,31 +363,17 @@ func (e *endpoint) DefaultTTL() uint8 {
 // MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus
 // the network layer max header length.
 func (e *endpoint) MTU() uint32 {
-	return calculateMTU(e.linkEP.MTU())
-}
-
-// NICID returns the ID of the NIC this endpoint belongs to.
-func (e *endpoint) NICID() tcpip.NICID {
-	return e.nicID
-}
-
-// Capabilities implements stack.NetworkEndpoint.Capabilities.
-func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
-	return e.linkEP.Capabilities()
+	networkMTU, err := calculateNetworkMTU(e.nic.MTU(), header.IPv6MinimumSize)
+	if err != nil {
+		return 0
+	}
+	return networkMTU
 }
 
 // MaxHeaderLength returns the maximum length needed by ipv6 headers (and
 // underlying protocols).
 func (e *endpoint) MaxHeaderLength() uint16 {
-	return e.linkEP.MaxHeaderLength() + header.IPv6MinimumSize
-}
-
-// GSOMaxSize returns the maximum GSO packet size.
-func (e *endpoint) GSOMaxSize() uint32 {
-	if gso, ok := e.linkEP.(stack.GSOEndpoint); ok {
-		return gso.GSOMaxSize()
-	}
-	return 0
+	return e.nic.MaxHeaderLength() + header.IPv6MinimumSize
 }
 
 func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams) {
@@ -99,12 +387,89 @@ func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params s
 		SrcAddr:       r.LocalAddress,
 		DstAddr:       r.RemoteAddress,
 	})
-	pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber
+	pkt.NetworkProtocolNumber = ProtocolNumber
+}
+
+func packetMustBeFragmented(pkt *stack.PacketBuffer, networkMTU uint32, gso *stack.GSO) bool {
+	payload := pkt.TransportHeader().View().Size() + pkt.Data.Size()
+	return (gso == nil || gso.Type == stack.GSONone) && uint32(payload) > networkMTU
+}
+
+// handleFragments fragments pkt and calls the handler function on each
+// fragment. It returns the number of fragments handled and the number of
+// fragments left to be processed. The IP header must already be present in the
+// original packet. The transport header protocol number is required to avoid
+// parsing the IPv6 extension headers.
+func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, networkMTU uint32, pkt *stack.PacketBuffer, transProto tcpip.TransportProtocolNumber, handler func(*stack.PacketBuffer) *tcpip.Error) (int, int, *tcpip.Error) {
+	networkHeader := header.IPv6(pkt.NetworkHeader().View())
+
+	// TODO(gvisor.dev/issue/3912): Once the Authentication or ESP Headers are
+	// supported for outbound packets, their length should not affect the fragment
+	// maximum payload length because they should only be transmitted once.
+	fragmentPayloadLen := (networkMTU - header.IPv6FragmentHeaderSize) &^ 7
+	if fragmentPayloadLen < header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit {
+		// We need at least 8 bytes of space left for the fragmentable part because
+		// the fragment payload must obviously be non-zero and must be a multiple
+		// of 8 as per RFC 8200 section 4.5:
+		//   Each complete fragment, except possibly the last ("rightmost") one, is
+		//   an integer multiple of 8 octets long.
+		return 0, 1, tcpip.ErrMessageTooLong
+	}
+
+	if fragmentPayloadLen < uint32(pkt.TransportHeader().View().Size()) {
+		// As per RFC 8200 Section 4.5, the Transport Header is expected to be small
+		// enough to fit in the first fragment.
+		return 0, 1, tcpip.ErrMessageTooLong
+	}
+
+	pf := fragmentation.MakePacketFragmenter(pkt, fragmentPayloadLen, calculateFragmentReserve(pkt))
+	id := atomic.AddUint32(&e.protocol.ids[hashRoute(r, e.protocol.hashIV)%buckets], 1)
+
+	var n int
+	for {
+		fragPkt, more := buildNextFragment(&pf, networkHeader, transProto, id)
+		if err := handler(fragPkt); err != nil {
+			return n, pf.RemainingFragmentCount() + 1, err
+		}
+		n++
+		if !more {
+			return n, pf.RemainingFragmentCount(), nil
+		}
+	}
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
 	e.addIPHeader(r, pkt, params)
+	return e.writePacket(r, gso, pkt, params.Protocol)
+}
+
+func (e *endpoint) writePacket(r *stack.Route, gso *stack.GSO, pkt *stack.PacketBuffer, protocol tcpip.TransportProtocolNumber) *tcpip.Error {
+	// iptables filtering. All packets that reach here are locally
+	// generated.
+	nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
+	ipt := e.protocol.stack.IPTables()
+	if ok := ipt.Check(stack.Output, pkt, gso, r, "", nicName); !ok {
+		// iptables is telling us to drop the packet.
+		r.Stats().IP.IPTablesOutputDropped.Increment()
+		return nil
+	}
+
+	// If the packet is manipulated as per NAT Output rules, handle packet
+	// based on destination address and do not send the packet to link
+	// layer.
+	//
+	// TODO(gvisor.dev/issue/170): We should do this for every
+	// packet, rather than only NATted packets, but removing this check
+	// short circuits broadcasts before they are sent out to other hosts.
+	if pkt.NatDone {
+		netHeader := header.IPv6(pkt.NetworkHeader().View())
+		if ep, err := e.protocol.stack.FindNetworkEndpoint(ProtocolNumber, netHeader.DestinationAddress()); err == nil {
+			route := r.ReverseRoute(netHeader.SourceAddress(), netHeader.DestinationAddress())
+			ep.HandlePacket(&route, pkt)
+			return nil
+		}
+	}
 
 	if r.Loop&stack.PacketLoop != 0 {
 		loopedR := r.MakeLoopedRoute()
@@ -120,11 +485,35 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 		return nil
 	}
 
+	networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size()))
+	if err != nil {
+		r.Stats().IP.OutgoingPacketErrors.Increment()
+		return err
+	}
+
+	if packetMustBeFragmented(pkt, networkMTU, gso) {
+		sent, remain, err := e.handleFragments(r, gso, networkMTU, pkt, protocol, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
+			// TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each
+			// fragment one by one using WritePacket() (current strategy) or if we
+			// want to create a PacketBufferList from the fragments and feed it to
+			// WritePackets(). It'll be faster but cost more memory.
+			return e.nic.WritePacket(r, gso, ProtocolNumber, fragPkt)
+		})
+		r.Stats().IP.PacketsSent.IncrementBy(uint64(sent))
+		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(remain))
+		return err
+	}
+
+	if err := e.nic.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+		r.Stats().IP.OutgoingPacketErrors.Increment()
+		return err
+	}
+
 	r.Stats().IP.PacketsSent.Increment()
-	return e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt)
+	return nil
 }
 
-// WritePackets implements stack.LinkEndpoint.WritePackets.
+// WritePackets implements stack.NetworkEndpoint.WritePackets.
 func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	if r.Loop&stack.PacketLoop != 0 {
 		panic("not implemented")
@@ -133,31 +522,140 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 		return pkts.Len(), nil
 	}
 
+	linkMTU := e.nic.MTU()
 	for pb := pkts.Front(); pb != nil; pb = pb.Next() {
 		e.addIPHeader(r, pb, params)
+
+		networkMTU, err := calculateNetworkMTU(linkMTU, uint32(pb.NetworkHeader().View().Size()))
+		if err != nil {
+			r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len()))
+			return 0, err
+		}
+		if packetMustBeFragmented(pb, networkMTU, gso) {
+			// Keep track of the packet that is about to be fragmented so it can be
+			// removed once the fragmentation is done.
+			originalPkt := pb
+			if _, _, err := e.handleFragments(r, gso, networkMTU, pb, params.Protocol, func(fragPkt *stack.PacketBuffer) *tcpip.Error {
+				// Modify the packet list in place with the new fragments.
+				pkts.InsertAfter(pb, fragPkt)
+				pb = fragPkt
+				return nil
+			}); err != nil {
+				r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len()))
+				return 0, err
+			}
+			// Remove the packet that was just fragmented and process the rest.
+			pkts.Remove(originalPkt)
+		}
+	}
+
+	// iptables filtering. All packets that reach here are locally
+	// generated.
+	nicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
+	ipt := e.protocol.stack.IPTables()
+	dropped, natPkts := ipt.CheckPackets(stack.Output, pkts, gso, r, nicName)
+	if len(dropped) == 0 && len(natPkts) == 0 {
+		// Fast path: If no packets are to be dropped then we can just invoke the
+		// faster WritePackets API directly.
+		n, err := e.nic.WritePackets(r, gso, pkts, ProtocolNumber)
+		r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+		if err != nil {
+			r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len() - n))
+		}
+		return n, err
+	}
+	r.Stats().IP.IPTablesOutputDropped.IncrementBy(uint64(len(dropped)))
+
+	// Slow path as we are dropping some packets in the batch degrade to
+	// emitting one packet at a time.
+	n := 0
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		if _, ok := dropped[pkt]; ok {
+			continue
+		}
+		if _, ok := natPkts[pkt]; ok {
+			netHeader := header.IPv6(pkt.NetworkHeader().View())
+			if ep, err := e.protocol.stack.FindNetworkEndpoint(ProtocolNumber, netHeader.DestinationAddress()); err == nil {
+				src := netHeader.SourceAddress()
+				dst := netHeader.DestinationAddress()
+				route := r.ReverseRoute(src, dst)
+				ep.HandlePacket(&route, pkt)
+				n++
+				continue
+			}
+		}
+		if err := e.nic.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+			r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+			r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len() - n + len(dropped)))
+			// Dropped packets aren't errors, so include them in
+			// the return value.
+			return n + len(dropped), err
+		}
+		n++
 	}
 
-	n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
 	r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
-	return n, err
+	// Dropped packets aren't errors, so include them in the return value.
+	return n + len(dropped), nil
 }
 
-// WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
-// supported by IPv6.
-func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
-	// TODO(b/146666412): Support IPv6 header-included packets.
-	return tcpip.ErrNotSupported
+// WriteHeaderIncludedPacket implements stack.NetworkEndpoint.
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
+	// The packet already has an IP header, but there are a few required checks.
+	h, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	if !ok {
+		return tcpip.ErrMalformedHeader
+	}
+	ip := header.IPv6(h)
+
+	// Always set the payload length.
+	pktSize := pkt.Data.Size()
+	ip.SetPayloadLength(uint16(pktSize - header.IPv6MinimumSize))
+
+	// Set the source address when zero.
+	if ip.SourceAddress() == header.IPv6Any {
+		ip.SetSourceAddress(r.LocalAddress)
+	}
+
+	// Set the destination. If the packet already included a destination, it will
+	// be part of the route anyways.
+	ip.SetDestinationAddress(r.RemoteAddress)
+
+	// Populate the packet buffer's network header and don't allow an invalid
+	// packet to be sent.
+	//
+	// Note that parsing only makes sure that the packet is well formed as per the
+	// wire format. We also want to check if the header's fields are valid before
+	// sending the packet.
+	proto, _, _, _, ok := parse.IPv6(pkt)
+	if !ok || !header.IPv6(pkt.NetworkHeader().View()).IsValid(pktSize) {
+		return tcpip.ErrMalformedHeader
+	}
+
+	return e.writePacket(r, nil /* gso */, pkt, proto)
 }
 
 // HandlePacket is called by the link layer when new ipv6 packets arrive for
 // this endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+	if !e.isEnabled() {
+		return
+	}
+
 	h := header.IPv6(pkt.NetworkHeader().View())
 	if !h.IsValid(pkt.Data.Size() + pkt.NetworkHeader().View().Size() + pkt.TransportHeader().View().Size()) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
 		return
 	}
 
+	// As per RFC 4291 section 2.7:
+	//   Multicast addresses must not be used as source addresses in IPv6
+	//   packets or appear in any Routing header.
+	if header.IsV6MulticastAddress(r.RemoteAddress) {
+		r.Stats().IP.InvalidSourceAddressesReceived.Increment()
+		return
+	}
+
 	// vv consists of:
 	// - Any IPv6 header bytes after the first 40 (i.e. extensions).
 	// - The transport header, if present.
@@ -168,7 +666,19 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), vv)
 	hasFragmentHeader := false
 
-	for firstHeader := true; ; firstHeader = false {
+	// iptables filtering. All packets that reach here are intended for
+	// this machine and need not be forwarded.
+	ipt := e.protocol.stack.IPTables()
+	if ok := ipt.Check(stack.Input, pkt, nil, nil, "", ""); !ok {
+		// iptables is telling us to drop the packet.
+		r.Stats().IP.IPTablesInputDropped.Increment()
+		return
+	}
+
+	for {
+		// Keep track of the start of the previous header so we can report the
+		// special case of a Hop by Hop at a location other than at the start.
+		previousHeaderStart := it.HeaderOffset()
 		extHdr, done, err := it.Next()
 		if err != nil {
 			r.Stats().IP.MalformedPacketsReceived.Increment()
@@ -182,11 +692,11 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 		case header.IPv6HopByHopOptionsExtHdr:
 			// As per RFC 8200 section 4.1, the Hop By Hop extension header is
 			// restricted to appear immediately after an IPv6 fixed header.
-			//
-			// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1
-			// (unrecognized next header) error in response to an extension header's
-			// Next Header field with the Hop By Hop extension header identifier.
-			if !firstHeader {
+			if previousHeaderStart != 0 {
+				_ = e.protocol.returnError(r, &icmpReasonParameterProblem{
+					code:    header.ICMPv6UnknownHeader,
+					pointer: previousHeaderStart,
+				}, pkt)
 				return
 			}
 
@@ -208,13 +718,25 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 				case header.IPv6OptionUnknownActionSkip:
 				case header.IPv6OptionUnknownActionDiscard:
 					return
-				case header.IPv6OptionUnknownActionDiscardSendICMP:
-					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
-					// unrecognized IPv6 extension header options.
-					return
 				case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest:
-					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
-					// unrecognized IPv6 extension header options.
+					if header.IsV6MulticastAddress(r.LocalAddress) {
+						return
+					}
+					fallthrough
+				case header.IPv6OptionUnknownActionDiscardSendICMP:
+					// This case satisfies a requirement of RFC 8200 section 4.2
+					// which states that an unknown option starting with bits [10] should:
+					//
+					//    discard the packet and, regardless of whether or not the
+					//    packet's Destination Address was a multicast address, send an
+					//    ICMP Parameter Problem, Code 2, message to the packet's
+					//    Source Address, pointing to the unrecognized Option Type.
+					//
+					_ = e.protocol.returnError(r, &icmpReasonParameterProblem{
+						code:               header.ICMPv6UnknownOption,
+						pointer:            it.ParseOffset() + optsIt.OptionOffset(),
+						respondToMulticast: true,
+					}, pkt)
 					return
 				default:
 					panic(fmt.Sprintf("unrecognized action for an unrecognized Hop By Hop extension header option = %d", opt))
@@ -225,16 +747,20 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 			// As per RFC 8200 section 4.4, if a node encounters a routing header with
 			// an unrecognized routing type value, with a non-zero Segments Left
 			// value, the node must discard the packet and send an ICMP Parameter
-			// Problem, Code 0. If the Segments Left is 0, the node must ignore the
-			// Routing extension header and process the next header in the packet.
+			// Problem, Code 0 to the packet's Source Address, pointing to the
+			// unrecognized Routing Type.
+			//
+			// If the Segments Left is 0, the node must ignore the Routing extension
+			// header and process the next header in the packet.
 			//
 			// Note, the stack does not yet handle any type of routing extension
 			// header, so we just make sure Segments Left is zero before processing
 			// the next extension header.
-			//
-			// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 0 for
-			// unrecognized routing types with a non-zero Segments Left value.
 			if extHdr.SegmentsLeft() != 0 {
+				_ = e.protocol.returnError(r, &icmpReasonParameterProblem{
+					code:    header.ICMPv6ErroneousHeader,
+					pointer: it.ParseOffset(),
+				}, pkt)
 				return
 			}
 
@@ -251,6 +777,8 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 				continue
 			}
 
+			fragmentFieldOffset := it.ParseOffset()
+
 			// Don't consume the iterator if we have the first fragment because we
 			// will use it to validate that the first fragment holds the upper layer
 			// header.
@@ -267,7 +795,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 					it, done, err := it.Next()
 					if err != nil {
 						r.Stats().IP.MalformedPacketsReceived.Increment()
-						r.Stats().IP.MalformedPacketsReceived.Increment()
+						r.Stats().IP.MalformedFragmentsReceived.Increment()
 						return
 					}
 					if done {
@@ -308,23 +836,62 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 				return
 			}
 
+			// As per RFC 2460 Section 4.5:
+			//
+			//    If the length of a fragment, as derived from the fragment packet's
+			//    Payload Length field, is not a multiple of 8 octets and the M flag
+			//    of that fragment is 1, then that fragment must be discarded and an
+			//    ICMP Parameter Problem, Code 0, message should be sent to the source
+			//    of the fragment, pointing to the Payload Length field of the
+			//    fragment packet.
+			if extHdr.More() && fragmentPayloadLen%header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit != 0 {
+				r.Stats().IP.MalformedPacketsReceived.Increment()
+				r.Stats().IP.MalformedFragmentsReceived.Increment()
+				_ = e.protocol.returnError(r, &icmpReasonParameterProblem{
+					code:    header.ICMPv6ErroneousHeader,
+					pointer: header.IPv6PayloadLenOffset,
+				}, pkt)
+				return
+			}
+
 			// The packet is a fragment, let's try to reassemble it.
 			start := extHdr.FragmentOffset() * header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit
-			last := start + uint16(fragmentPayloadLen) - 1
 
-			// Drop the packet if the fragmentOffset is incorrect. i.e the
-			// combination of fragmentOffset and pkt.Data.size() causes a
-			// wrap around resulting in last being less than the offset.
-			if last < start {
+			// As per RFC 2460 Section 4.5:
+			//
+			//    If the length and offset of a fragment are such that the Payload
+			//    Length of the packet reassembled from that fragment would exceed
+			//    65,535 octets, then that fragment must be discarded and an ICMP
+			//    Parameter Problem, Code 0, message should be sent to the source of
+			//    the fragment, pointing to the Fragment Offset field of the fragment
+			//    packet.
+			if int(start)+fragmentPayloadLen > header.IPv6MaximumPayloadSize {
 				r.Stats().IP.MalformedPacketsReceived.Increment()
 				r.Stats().IP.MalformedFragmentsReceived.Increment()
+				_ = e.protocol.returnError(r, &icmpReasonParameterProblem{
+					code:    header.ICMPv6ErroneousHeader,
+					pointer: fragmentFieldOffset,
+				}, pkt)
 				return
 			}
 
-			var ready bool
+			// Set up a callback in case we need to send a Time Exceeded Message as
+			// per RFC 2460 Section 4.5.
+			var releaseCB func(bool)
+			if start == 0 {
+				pkt := pkt.Clone()
+				r := r.Clone()
+				releaseCB = func(timedOut bool) {
+					if timedOut {
+						_ = e.protocol.returnError(&r, &icmpReasonReassemblyTimeout{}, pkt)
+					}
+					r.Release()
+				}
+			}
+
 			// Note that pkt doesn't have its transport header set after reassembly,
 			// and won't until DeliverNetworkPacket sets it.
-			pkt.Data, ready, err = e.protocol.fragmentation.Process(
+			data, proto, ready, err := e.protocol.fragmentation.Process(
 				// IPv6 ignores the Protocol field since the ID only needs to be unique
 				// across source-destination pairs, as per RFC 8200 section 4.5.
 				fragmentation.FragmentID{
@@ -333,21 +900,25 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 					ID:          extHdr.ID(),
 				},
 				start,
-				last,
+				start+uint16(fragmentPayloadLen)-1,
 				extHdr.More(),
+				uint8(rawPayload.Identifier),
 				rawPayload.Buf,
+				releaseCB,
 			)
 			if err != nil {
 				r.Stats().IP.MalformedPacketsReceived.Increment()
 				r.Stats().IP.MalformedFragmentsReceived.Increment()
 				return
 			}
+			pkt.Data = data
 
 			if ready {
 				// We create a new iterator with the reassembled packet because we could
 				// have more extension headers in the reassembled payload, as per RFC
-				// 8200 section 4.5.
-				it = header.MakeIPv6PayloadIterator(rawPayload.Identifier, pkt.Data)
+				// 8200 section 4.5. We also use the NextHeader value from the first
+				// fragment.
+				it = header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(proto), pkt.Data)
 			}
 
 		case header.IPv6DestinationOptionsExtHdr:
@@ -369,13 +940,25 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 				case header.IPv6OptionUnknownActionSkip:
 				case header.IPv6OptionUnknownActionDiscard:
 					return
-				case header.IPv6OptionUnknownActionDiscardSendICMP:
-					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
-					// unrecognized IPv6 extension header options.
-					return
 				case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest:
-					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
-					// unrecognized IPv6 extension header options.
+					if header.IsV6MulticastAddress(r.LocalAddress) {
+						return
+					}
+					fallthrough
+				case header.IPv6OptionUnknownActionDiscardSendICMP:
+					// This case satisfies a requirement of RFC 8200 section 4.2
+					// which states that an unknown option starting with bits [10] should:
+					//
+					//    discard the packet and, regardless of whether or not the
+					//    packet's Destination Address was a multicast address, send an
+					//    ICMP Parameter Problem, Code 2, message to the packet's
+					//    Source Address, pointing to the unrecognized Option Type.
+					//
+					_ = e.protocol.returnError(r, &icmpReasonParameterProblem{
+						code:               header.ICMPv6UnknownOption,
+						pointer:            it.ParseOffset() + optsIt.OptionOffset(),
+						respondToMulticast: true,
+					}, pkt)
 					return
 				default:
 					panic(fmt.Sprintf("unrecognized action for an unrecognized Destination extension header option = %d", opt))
@@ -394,21 +977,55 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 			extHdr.Buf.TrimFront(pkt.TransportHeader().View().Size())
 			pkt.Data = extHdr.Buf
 
+			r.Stats().IP.PacketsDelivered.Increment()
 			if p := tcpip.TransportProtocolNumber(extHdr.Identifier); p == header.ICMPv6ProtocolNumber {
+				pkt.TransportProtocolNumber = p
 				e.handleICMP(r, pkt, hasFragmentHeader)
 			} else {
 				r.Stats().IP.PacketsDelivered.Increment()
-				// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 error
-				// in response to unrecognized next header values.
-				e.dispatcher.DeliverTransportPacket(r, p, pkt)
+				switch res := e.dispatcher.DeliverTransportPacket(r, p, pkt); res {
+				case stack.TransportPacketHandled:
+				case stack.TransportPacketDestinationPortUnreachable:
+					// As per RFC 4443 section 3.1:
+					//   A destination node SHOULD originate a Destination Unreachable
+					//   message with Code 4 in response to a packet for which the
+					//   transport protocol (e.g., UDP) has no listener, if that transport
+					//   protocol has no alternative means to inform the sender.
+					_ = e.protocol.returnError(r, &icmpReasonPortUnreachable{}, pkt)
+				case stack.TransportPacketProtocolUnreachable:
+					// As per RFC 8200 section 4. (page 7):
+					//   Extension headers are numbered from IANA IP Protocol Numbers
+					//   [IANA-PN], the same values used for IPv4 and IPv6.  When
+					//   processing a sequence of Next Header values in a packet, the
+					//   first one that is not an extension header [IANA-EH] indicates
+					//   that the next item in the packet is the corresponding upper-layer
+					//   header.
+					// With more related information on page 8:
+					//   If, as a result of processing a header, the destination node is
+					//   required to proceed to the next header but the Next Header value
+					//   in the current header is unrecognized by the node, it should
+					//   discard the packet and send an ICMP Parameter Problem message to
+					//   the source of the packet, with an ICMP Code value of 1
+					//   ("unrecognized Next Header type encountered") and the ICMP
+					//   Pointer field containing the offset of the unrecognized value
+					//   within the original packet.
+					//
+					// Which when taken together indicate that an unknown protocol should
+					// be treated as an unrecognized next header value.
+					_ = e.protocol.returnError(r, &icmpReasonParameterProblem{
+						code:    header.ICMPv6UnknownHeader,
+						pointer: it.ParseOffset(),
+					}, pkt)
+				default:
+					panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res))
+				}
 			}
 
 		default:
-			// If we receive a packet for an extension header we do not yet handle,
-			// drop the packet for now.
-			//
-			// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 error
-			// in response to unrecognized next header values.
+			_ = e.protocol.returnError(r, &icmpReasonParameterProblem{
+				code:    header.ICMPv6UnknownHeader,
+				pointer: it.ParseOffset(),
+			}, pkt)
 			r.Stats().UnknownProtocolRcvdPackets.Increment()
 			return
 		}
@@ -416,19 +1033,343 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 }
 
 // Close cleans up resources associated with the endpoint.
-func (*endpoint) Close() {}
+func (e *endpoint) Close() {
+	e.mu.Lock()
+	e.disableLocked()
+	e.mu.ndp.removeSLAACAddresses(false /* keepLinkLocal */)
+	e.stopDADForPermanentAddressesLocked()
+	e.mu.addressableEndpointState.Cleanup()
+	e.mu.Unlock()
+
+	e.protocol.forgetEndpoint(e)
+}
 
 // NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber.
 func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
 	return e.protocol.Number()
 }
 
+// AddAndAcquirePermanentAddress implements stack.AddressableEndpoint.
+func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, *tcpip.Error) {
+	// TODO(b/169350103): add checks here after making sure we no longer receive
+	// an empty address.
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.addAndAcquirePermanentAddressLocked(addr, peb, configType, deprecated)
+}
+
+// addAndAcquirePermanentAddressLocked is like AddAndAcquirePermanentAddress but
+// with locking requirements.
+//
+// addAndAcquirePermanentAddressLocked also joins the passed address's
+// solicited-node multicast group and start duplicate address detection.
+//
+// Precondition: e.mu must be write locked.
+func (e *endpoint) addAndAcquirePermanentAddressLocked(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, *tcpip.Error) {
+	addressEndpoint, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, peb, configType, deprecated)
+	if err != nil {
+		return nil, err
+	}
+
+	if !header.IsV6UnicastAddress(addr.Address) {
+		return addressEndpoint, nil
+	}
+
+	snmc := header.SolicitedNodeAddr(addr.Address)
+	if _, err := e.mu.addressableEndpointState.JoinGroup(snmc); err != nil {
+		return nil, err
+	}
+
+	addressEndpoint.SetKind(stack.PermanentTentative)
+
+	if e.Enabled() {
+		if err := e.mu.ndp.startDuplicateAddressDetection(addr.Address, addressEndpoint); err != nil {
+			return nil, err
+		}
+	}
+
+	return addressEndpoint, nil
+}
+
+// RemovePermanentAddress implements stack.AddressableEndpoint.
+func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	addressEndpoint := e.getAddressRLocked(addr)
+	if addressEndpoint == nil || !addressEndpoint.GetKind().IsPermanent() {
+		return tcpip.ErrBadLocalAddress
+	}
+
+	return e.removePermanentEndpointLocked(addressEndpoint, true)
+}
+
+// removePermanentEndpointLocked is like removePermanentAddressLocked except
+// it works with a stack.AddressEndpoint.
+//
+// Precondition: e.mu must be write locked.
+func (e *endpoint) removePermanentEndpointLocked(addressEndpoint stack.AddressEndpoint, allowSLAACInvalidation bool) *tcpip.Error {
+	addr := addressEndpoint.AddressWithPrefix()
+	unicast := header.IsV6UnicastAddress(addr.Address)
+	if unicast {
+		e.mu.ndp.stopDuplicateAddressDetection(addr.Address)
+
+		// If we are removing an address generated via SLAAC, cleanup
+		// its SLAAC resources and notify the integrator.
+		switch addressEndpoint.ConfigType() {
+		case stack.AddressConfigSlaac:
+			e.mu.ndp.cleanupSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation)
+		case stack.AddressConfigSlaacTemp:
+			e.mu.ndp.cleanupTempSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation)
+		}
+	}
+
+	if err := e.mu.addressableEndpointState.RemovePermanentEndpoint(addressEndpoint); err != nil {
+		return err
+	}
+
+	if !unicast {
+		return nil
+	}
+
+	snmc := header.SolicitedNodeAddr(addr.Address)
+	if _, err := e.mu.addressableEndpointState.LeaveGroup(snmc); err != nil && err != tcpip.ErrBadLocalAddress {
+		return err
+	}
+
+	return nil
+}
+
+// hasPermanentAddressLocked returns true if the endpoint has a permanent
+// address equal to the passed address.
+//
+// Precondition: e.mu must be read or write locked.
+func (e *endpoint) hasPermanentAddressRLocked(addr tcpip.Address) bool {
+	addressEndpoint := e.getAddressRLocked(addr)
+	if addressEndpoint == nil {
+		return false
+	}
+	return addressEndpoint.GetKind().IsPermanent()
+}
+
+// getAddressRLocked returns the endpoint for the passed address.
+//
+// Precondition: e.mu must be read or write locked.
+func (e *endpoint) getAddressRLocked(localAddr tcpip.Address) stack.AddressEndpoint {
+	return e.mu.addressableEndpointState.ReadOnly().Lookup(localAddr)
+}
+
+// MainAddress implements stack.AddressableEndpoint.
+func (e *endpoint) MainAddress() tcpip.AddressWithPrefix {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.mu.addressableEndpointState.MainAddress()
+}
+
+// AcquireAssignedAddress implements stack.AddressableEndpoint.
+func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.acquireAddressOrCreateTempLocked(localAddr, allowTemp, tempPEB)
+}
+
+// acquireAddressOrCreateTempLocked is like AcquireAssignedAddress but with
+// locking requirements.
+//
+// Precondition: e.mu must be write locked.
+func (e *endpoint) acquireAddressOrCreateTempLocked(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint {
+	return e.mu.addressableEndpointState.AcquireAssignedAddress(localAddr, allowTemp, tempPEB)
+}
+
+// AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint.
+func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, allowExpired)
+}
+
+// acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress
+// but with locking requirements.
+//
+// Precondition: e.mu must be read locked.
+func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint {
+	// addrCandidate is a candidate for Source Address Selection, as per
+	// RFC 6724 section 5.
+	type addrCandidate struct {
+		addressEndpoint stack.AddressEndpoint
+		scope           header.IPv6AddressScope
+	}
+
+	if len(remoteAddr) == 0 {
+		return e.mu.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, allowExpired)
+	}
+
+	// Create a candidate set of available addresses we can potentially use as a
+	// source address.
+	var cs []addrCandidate
+	e.mu.addressableEndpointState.ReadOnly().ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) {
+		// If r is not valid for outgoing connections, it is not a valid endpoint.
+		if !addressEndpoint.IsAssigned(allowExpired) {
+			return
+		}
+
+		addr := addressEndpoint.AddressWithPrefix().Address
+		scope, err := header.ScopeForIPv6Address(addr)
+		if err != nil {
+			// Should never happen as we got r from the primary IPv6 endpoint list and
+			// ScopeForIPv6Address only returns an error if addr is not an IPv6
+			// address.
+			panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", addr, err))
+		}
+
+		cs = append(cs, addrCandidate{
+			addressEndpoint: addressEndpoint,
+			scope:           scope,
+		})
+	})
+
+	remoteScope, err := header.ScopeForIPv6Address(remoteAddr)
+	if err != nil {
+		// primaryIPv6Endpoint should never be called with an invalid IPv6 address.
+		panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err))
+	}
+
+	// Sort the addresses as per RFC 6724 section 5 rules 1-3.
+	//
+	// TODO(b/146021396): Implement rules 4-8 of RFC 6724 section 5.
+	sort.Slice(cs, func(i, j int) bool {
+		sa := cs[i]
+		sb := cs[j]
+
+		// Prefer same address as per RFC 6724 section 5 rule 1.
+		if sa.addressEndpoint.AddressWithPrefix().Address == remoteAddr {
+			return true
+		}
+		if sb.addressEndpoint.AddressWithPrefix().Address == remoteAddr {
+			return false
+		}
+
+		// Prefer appropriate scope as per RFC 6724 section 5 rule 2.
+		if sa.scope < sb.scope {
+			return sa.scope >= remoteScope
+		} else if sb.scope < sa.scope {
+			return sb.scope < remoteScope
+		}
+
+		// Avoid deprecated addresses as per RFC 6724 section 5 rule 3.
+		if saDep, sbDep := sa.addressEndpoint.Deprecated(), sb.addressEndpoint.Deprecated(); saDep != sbDep {
+			// If sa is not deprecated, it is preferred over sb.
+			return sbDep
+		}
+
+		// Prefer temporary addresses as per RFC 6724 section 5 rule 7.
+		if saTemp, sbTemp := sa.addressEndpoint.ConfigType() == stack.AddressConfigSlaacTemp, sb.addressEndpoint.ConfigType() == stack.AddressConfigSlaacTemp; saTemp != sbTemp {
+			return saTemp
+		}
+
+		// sa and sb are equal, return the endpoint that is closest to the front of
+		// the primary endpoint list.
+		return i < j
+	})
+
+	// Return the most preferred address that can have its reference count
+	// incremented.
+	for _, c := range cs {
+		if c.addressEndpoint.IncRef() {
+			return c.addressEndpoint
+		}
+	}
+
+	return nil
+}
+
+// PrimaryAddresses implements stack.AddressableEndpoint.
+func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.mu.addressableEndpointState.PrimaryAddresses()
+}
+
+// PermanentAddresses implements stack.AddressableEndpoint.
+func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.mu.addressableEndpointState.PermanentAddresses()
+}
+
+// JoinGroup implements stack.GroupAddressableEndpoint.
+func (e *endpoint) JoinGroup(addr tcpip.Address) (bool, *tcpip.Error) {
+	if !header.IsV6MulticastAddress(addr) {
+		return false, tcpip.ErrBadAddress
+	}
+
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.mu.addressableEndpointState.JoinGroup(addr)
+}
+
+// LeaveGroup implements stack.GroupAddressableEndpoint.
+func (e *endpoint) LeaveGroup(addr tcpip.Address) (bool, *tcpip.Error) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.mu.addressableEndpointState.LeaveGroup(addr)
+}
+
+// IsInGroup implements stack.GroupAddressableEndpoint.
+func (e *endpoint) IsInGroup(addr tcpip.Address) bool {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.mu.addressableEndpointState.IsInGroup(addr)
+}
+
+var _ stack.ForwardingNetworkProtocol = (*protocol)(nil)
+var _ stack.NetworkProtocol = (*protocol)(nil)
+
 type protocol struct {
+	stack *stack.Stack
+
+	mu struct {
+		sync.RWMutex
+
+		eps map[*endpoint]struct{}
+	}
+
+	ids    []uint32
+	hashIV uint32
+
 	// defaultTTL is the current default TTL for the protocol. Only the
-	// uint8 portion of it is meaningful and it must be accessed
-	// atomically.
-	defaultTTL    uint32
+	// uint8 portion of it is meaningful.
+	//
+	// Must be accessed using atomic operations.
+	defaultTTL uint32
+
+	// forwarding is set to 1 when the protocol has forwarding enabled and 0
+	// when it is disabled.
+	//
+	// Must be accessed using atomic operations.
+	forwarding uint32
+
 	fragmentation *fragmentation.Fragmentation
+
+	// ndpDisp is the NDP event dispatcher that is used to send the netstack
+	// integrator NDP related events.
+	ndpDisp NDPDispatcher
+
+	// ndpConfigs is the default NDP configurations used by an IPv6 endpoint.
+	ndpConfigs NDPConfigurations
+
+	// opaqueIIDOpts hold the options for generating opaque interface identifiers
+	// (IIDs) as outlined by RFC 7217.
+	opaqueIIDOpts OpaqueInterfaceIdentifierOptions
+
+	// tempIIDSeed is used to seed the initial temporary interface identifier
+	// history value used to generate IIDs for temporary SLAAC addresses.
+	tempIIDSeed []byte
+
+	// autoGenIPv6LinkLocal determines whether or not the stack attempts to
+	// auto-generate an IPv6 link-local address for newly enabled non-loopback
+	// NICs. See the AutoGenIPv6LinkLocal field of Options for more details.
+	autoGenIPv6LinkLocal bool
 }
 
 // Number returns the ipv6 protocol number.
@@ -453,22 +1394,42 @@ func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 }
 
 // NewEndpoint creates a new ipv6 endpoint.
-func (p *protocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) stack.NetworkEndpoint {
-	return &endpoint{
-		nicID:         nicID,
-		linkEP:        linkEP,
+func (p *protocol) NewEndpoint(nic stack.NetworkInterface, linkAddrCache stack.LinkAddressCache, nud stack.NUDHandler, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint {
+	e := &endpoint{
+		nic:           nic,
 		linkAddrCache: linkAddrCache,
+		nud:           nud,
 		dispatcher:    dispatcher,
 		protocol:      p,
-		stack:         st,
 	}
+	e.mu.addressableEndpointState.Init(e)
+	e.mu.ndp = ndpState{
+		ep:             e,
+		configs:        p.ndpConfigs,
+		dad:            make(map[tcpip.Address]dadState),
+		defaultRouters: make(map[tcpip.Address]defaultRouterState),
+		onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState),
+		slaacPrefixes:  make(map[tcpip.Subnet]slaacPrefixState),
+	}
+	e.mu.ndp.initializeTempAddrState()
+
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	p.mu.eps[e] = struct{}{}
+	return e
+}
+
+func (p *protocol) forgetEndpoint(e *endpoint) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	delete(p.mu.eps, e)
 }
 
 // SetOption implements NetworkProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case tcpip.DefaultTTLOption:
-		p.SetDefaultTTL(uint8(v))
+	case *tcpip.DefaultTTLOption:
+		p.SetDefaultTTL(uint8(*v))
 		return nil
 	default:
 		return tcpip.ErrUnknownProtocolOption
@@ -476,7 +1437,7 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 }
 
 // Option implements NetworkProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
 	case *tcpip.DefaultTTLOption:
 		*v = tcpip.DefaultTTLOption(p.DefaultTTL())
@@ -502,91 +1463,193 @@ func (*protocol) Close() {}
 // Wait implements stack.TransportProtocol.Wait.
 func (*protocol) Wait() {}
 
-// Parse implements stack.TransportProtocol.Parse.
+// Parse implements stack.NetworkProtocol.Parse.
 func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
-	hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	proto, _, fragOffset, fragMore, ok := parse.IPv6(pkt)
 	if !ok {
 		return 0, false, false
 	}
-	ipHdr := header.IPv6(hdr)
 
-	// dataClone consists of:
-	// - Any IPv6 header bytes after the first 40 (i.e. extensions).
-	// - The transport header, if present.
-	// - Any other payload data.
-	views := [8]buffer.View{}
-	dataClone := pkt.Data.Clone(views[:])
-	dataClone.TrimFront(header.IPv6MinimumSize)
-	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone)
+	return proto, !fragMore && fragOffset == 0, true
+}
+
+// Forwarding implements stack.ForwardingNetworkProtocol.
+func (p *protocol) Forwarding() bool {
+	return uint8(atomic.LoadUint32(&p.forwarding)) == 1
+}
+
+// setForwarding sets the forwarding status for the protocol.
+//
+// Returns true if the forwarding status was updated.
+func (p *protocol) setForwarding(v bool) bool {
+	if v {
+		return atomic.SwapUint32(&p.forwarding, 1) == 0
+	}
+	return atomic.SwapUint32(&p.forwarding, 0) == 1
+}
+
+// SetForwarding implements stack.ForwardingNetworkProtocol.
+func (p *protocol) SetForwarding(v bool) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
 
-	// Iterate over the IPv6 extensions to find their length.
+	if !p.setForwarding(v) {
+		return
+	}
+
+	for ep := range p.mu.eps {
+		ep.transitionForwarding(v)
+	}
+}
+
+// calculateNetworkMTU calculates the network-layer payload MTU based on the
+// link-layer payload MTU and the length of every IPv6 header.
+// Note that this is different than the Payload Length field of the IPv6 header,
+// which includes the length of the extension headers.
+func calculateNetworkMTU(linkMTU, networkHeadersLen uint32) (uint32, *tcpip.Error) {
+	if linkMTU < header.IPv6MinimumMTU {
+		return 0, tcpip.ErrInvalidEndpointState
+	}
+
+	// As per RFC 7112 section 5, we should discard packets if their IPv6 header
+	// is bigger than 1280 bytes (ie, the minimum link MTU) since we do not
+	// support PMTU discovery:
+	//   Hosts that do not discover the Path MTU MUST limit the IPv6 Header Chain
+	//   length to 1280 bytes.  Limiting the IPv6 Header Chain length to 1280
+	//   bytes ensures that the header chain length does not exceed the IPv6
+	//   minimum MTU.
+	if networkHeadersLen > header.IPv6MinimumMTU {
+		return 0, tcpip.ErrMalformedHeader
+	}
+
+	networkMTU := linkMTU - uint32(networkHeadersLen)
+	if networkMTU > maxPayloadSize {
+		networkMTU = maxPayloadSize
+	}
+	return networkMTU, nil
+}
+
+// Options holds options to configure a new protocol.
+type Options struct {
+	// NDPConfigs is the default NDP configurations used by interfaces.
+	NDPConfigs NDPConfigurations
+
+	// AutoGenIPv6LinkLocal determines whether or not the stack attempts to
+	// auto-generate an IPv6 link-local address for newly enabled non-loopback
+	// NICs.
 	//
-	// Parsing occurs again in HandlePacket because we don't track the
-	// extensions in PacketBuffer. Unfortunately, that means HandlePacket
-	// has to do the parsing work again.
-	var nextHdr tcpip.TransportProtocolNumber
-	foundNext := true
-	extensionsSize := 0
-traverseExtensions:
-	for extHdr, done, err := it.Next(); ; extHdr, done, err = it.Next() {
-		if err != nil {
-			break
-		}
-		// If we exhaust the extension list, the entire packet is the IPv6 header
-		// and (possibly) extensions.
-		if done {
-			extensionsSize = dataClone.Size()
-			foundNext = false
-			break
-		}
+	// Note, setting this to true does not mean that a link-local address is
+	// assigned right away, or at all. If Duplicate Address Detection is enabled,
+	// an address is only assigned if it successfully resolves. If it fails, no
+	// further attempts are made to auto-generate an IPv6 link-local adddress.
+	//
+	// The generated link-local address follows RFC 4291 Appendix A guidelines.
+	AutoGenIPv6LinkLocal bool
 
-		switch extHdr := extHdr.(type) {
-		case header.IPv6FragmentExtHdr:
-			// If this is an atomic fragment, we don't have to treat it specially.
-			if !extHdr.More() && extHdr.FragmentOffset() == 0 {
-				continue
-			}
-			// This is a non-atomic fragment and has to be re-assembled before we can
-			// examine the payload for a transport header.
-			foundNext = false
+	// NDPDisp is the NDP event dispatcher that an integrator can provide to
+	// receive NDP related events.
+	NDPDisp NDPDispatcher
 
-		case header.IPv6RawPayloadHeader:
-			// We've found the payload after any extensions.
-			extensionsSize = dataClone.Size() - extHdr.Buf.Size()
-			nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier)
-			break traverseExtensions
+	// OpaqueIIDOpts hold the options for generating opaque interface
+	// identifiers (IIDs) as outlined by RFC 7217.
+	OpaqueIIDOpts OpaqueInterfaceIdentifierOptions
 
-		default:
-			// Any other extension is a no-op, keep looping until we find the payload.
+	// TempIIDSeed is used to seed the initial temporary interface identifier
+	// history value used to generate IIDs for temporary SLAAC addresses.
+	//
+	// Temporary SLAAC adresses are short-lived addresses which are unpredictable
+	// and random from the perspective of other nodes on the network. It is
+	// recommended that the seed be a random byte buffer of at least
+	// header.IIDSize bytes to make sure that temporary SLAAC addresses are
+	// sufficiently random. It should follow minimum randomness requirements for
+	// security as outlined by RFC 4086.
+	//
+	// Note: using a nil value, the same seed across netstack program runs, or a
+	// seed that is too small would reduce randomness and increase predictability,
+	// defeating the purpose of temporary SLAAC addresses.
+	TempIIDSeed []byte
+}
+
+// NewProtocolWithOptions returns an IPv6 network protocol.
+func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory {
+	opts.NDPConfigs.validate()
+
+	ids := hash.RandN32(buckets)
+	hashIV := hash.RandN32(1)[0]
+
+	return func(s *stack.Stack) stack.NetworkProtocol {
+		p := &protocol{
+			stack:         s,
+			fragmentation: fragmentation.NewFragmentation(header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock()),
+			ids:           ids,
+			hashIV:        hashIV,
+
+			ndpDisp:              opts.NDPDisp,
+			ndpConfigs:           opts.NDPConfigs,
+			opaqueIIDOpts:        opts.OpaqueIIDOpts,
+			tempIIDSeed:          opts.TempIIDSeed,
+			autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
 		}
+		p.mu.eps = make(map[*endpoint]struct{})
+		p.SetDefaultTTL(DefaultTTL)
+		return p
 	}
+}
 
-	// Put the IPv6 header with extensions in pkt.NetworkHeader().
-	hdr, ok = pkt.NetworkHeader().Consume(header.IPv6MinimumSize + extensionsSize)
-	if !ok {
-		panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size()))
-	}
-	ipHdr = header.IPv6(hdr)
-	pkt.Data.CapLength(int(ipHdr.PayloadLength()))
-	pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber
+// NewProtocol is equivalent to NewProtocolWithOptions with an empty Options.
+func NewProtocol(s *stack.Stack) stack.NetworkProtocol {
+	return NewProtocolWithOptions(Options{})(s)
+}
 
-	return nextHdr, foundNext, true
+func calculateFragmentReserve(pkt *stack.PacketBuffer) int {
+	return pkt.AvailableHeaderBytes() + pkt.NetworkHeader().View().Size() + header.IPv6FragmentHeaderSize
 }
 
-// calculateMTU calculates the network-layer payload MTU based on the link-layer
-// payload mtu.
-func calculateMTU(mtu uint32) uint32 {
-	mtu -= header.IPv6MinimumSize
-	if mtu <= maxPayloadSize {
-		return mtu
+// hashRoute calculates a hash value for the given route. It uses the source &
+// destination address and 32-bit number to generate the hash.
+func hashRoute(r *stack.Route, hashIV uint32) uint32 {
+	// The FNV-1a was chosen because it is a fast hashing algorithm, and
+	// cryptographic properties are not needed here.
+	h := fnv.New32a()
+	if _, err := h.Write([]byte(r.LocalAddress)); err != nil {
+		panic(fmt.Sprintf("Hash.Write: %s, but Hash' implementation of Write is not expected to ever return an error", err))
+	}
+
+	if _, err := h.Write([]byte(r.RemoteAddress)); err != nil {
+		panic(fmt.Sprintf("Hash.Write: %s, but Hash' implementation of Write is not expected to ever return an error", err))
+	}
+
+	s := make([]byte, 4)
+	binary.LittleEndian.PutUint32(s, hashIV)
+	if _, err := h.Write(s); err != nil {
+		panic(fmt.Sprintf("Hash.Write: %s, but Hash' implementation of Write is not expected ever to return an error", err))
 	}
-	return maxPayloadSize
+
+	return h.Sum32()
 }
 
-// NewProtocol returns an IPv6 network protocol.
-func NewProtocol() stack.NetworkProtocol {
-	return &protocol{
-		defaultTTL:    DefaultTTL,
-		fragmentation: fragmentation.NewFragmentation(header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout),
+func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeaders header.IPv6, transportProto tcpip.TransportProtocolNumber, id uint32) (*stack.PacketBuffer, bool) {
+	fragPkt, offset, copied, more := pf.BuildNextFragment()
+	fragPkt.NetworkProtocolNumber = ProtocolNumber
+
+	originalIPHeadersLength := len(originalIPHeaders)
+	fragmentIPHeadersLength := originalIPHeadersLength + header.IPv6FragmentHeaderSize
+	fragmentIPHeaders := header.IPv6(fragPkt.NetworkHeader().Push(fragmentIPHeadersLength))
+
+	// Copy the IPv6 header and any extension headers already populated.
+	if copied := copy(fragmentIPHeaders, originalIPHeaders); copied != originalIPHeadersLength {
+		panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got %d, want %d", copied, originalIPHeadersLength))
 	}
+	fragmentIPHeaders.SetNextHeader(header.IPv6FragmentHeader)
+	fragmentIPHeaders.SetPayloadLength(uint16(copied + fragmentIPHeadersLength - header.IPv6MinimumSize))
+
+	fragmentHeader := header.IPv6Fragment(fragmentIPHeaders[originalIPHeadersLength:])
+	fragmentHeader.Encode(&header.IPv6FragmentFields{
+		M:              more,
+		FragmentOffset: uint16(offset / header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit),
+		Identification: id,
+		NextHeader:     uint8(transportProto),
+	})
+
+	return fragPkt, more
 }
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index 0a183bfde..c593c0004 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -15,15 +15,22 @@
 package ipv6
 
 import (
+	"encoding/hex"
+	"fmt"
+	"math"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/faketime"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/network/testutil"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -43,6 +50,8 @@ const (
 	fragmentExtHdrID    = uint8(header.IPv6FragmentExtHdrIdentifier)
 	destinationExtHdrID = uint8(header.IPv6DestinationOptionsExtHdrIdentifier)
 	noNextHdrID         = uint8(header.IPv6NoNextHeaderIdentifier)
+
+	extraHeaderReserve = 50
 )
 
 // testReceiveICMP tests receiving an ICMP packet from src to dst. want is the
@@ -51,8 +60,8 @@ func testReceiveICMP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 	t.Helper()
 
 	// Receive ICMP packet.
-	hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize)
-	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
+	hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborAdvertMinimumSize)
+	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertMinimumSize))
 	pkt.SetType(header.ICMPv6NeighborAdvert)
 	pkt.SetChecksum(header.ICMPv6Checksum(pkt, src, dst, buffer.VectorisedView{}))
 	payloadLength := hdr.UsedLength()
@@ -134,25 +143,103 @@ func testReceiveUDP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 	}
 }
 
+func compareFragments(packets []*stack.PacketBuffer, sourcePacket *stack.PacketBuffer, mtu uint32, wantFragments []fragmentInfo, proto tcpip.TransportProtocolNumber) error {
+	// sourcePacket does not have its IP Header populated. Let's copy the one
+	// from the first fragment.
+	source := header.IPv6(packets[0].NetworkHeader().View())
+	sourceIPHeadersLen := len(source)
+	vv := buffer.NewVectorisedView(sourcePacket.Size(), sourcePacket.Views())
+	source = append(source, vv.ToView()...)
+
+	var reassembledPayload buffer.VectorisedView
+	for i, fragment := range packets {
+		// Confirm that the packet is valid.
+		allBytes := buffer.NewVectorisedView(fragment.Size(), fragment.Views())
+		fragmentIPHeaders := header.IPv6(allBytes.ToView())
+		if !fragmentIPHeaders.IsValid(len(fragmentIPHeaders)) {
+			return fmt.Errorf("fragment #%d: IP packet is invalid:\n%s", i, hex.Dump(fragmentIPHeaders))
+		}
+
+		fragmentIPHeadersLength := fragment.NetworkHeader().View().Size()
+		if fragmentIPHeadersLength != sourceIPHeadersLen {
+			return fmt.Errorf("fragment #%d: got fragmentIPHeadersLength = %d, want = %d", i, fragmentIPHeadersLength, sourceIPHeadersLen)
+		}
+
+		if got := len(fragmentIPHeaders); got > int(mtu) {
+			return fmt.Errorf("fragment #%d: got len(fragmentIPHeaders) = %d, want <= %d", i, got, mtu)
+		}
+
+		sourceIPHeader := source[:header.IPv6MinimumSize]
+		fragmentIPHeader := fragmentIPHeaders[:header.IPv6MinimumSize]
+
+		if got := fragmentIPHeaders.PayloadLength(); got != wantFragments[i].payloadSize {
+			return fmt.Errorf("fragment #%d: got fragmentIPHeaders.PayloadLength() = %d, want = %d", i, got, wantFragments[i].payloadSize)
+		}
+
+		// We expect the IPv6 Header to be similar across each fragment, besides the
+		// payload length.
+		sourceIPHeader.SetPayloadLength(0)
+		fragmentIPHeader.SetPayloadLength(0)
+		if diff := cmp.Diff(fragmentIPHeader, sourceIPHeader); diff != "" {
+			return fmt.Errorf("fragment #%d: fragmentIPHeader mismatch (-want +got):\n%s", i, diff)
+		}
+
+		if got := fragment.AvailableHeaderBytes(); got != extraHeaderReserve {
+			return fmt.Errorf("fragment #%d: got packet.AvailableHeaderBytes() = %d, want = %d", i, got, extraHeaderReserve)
+		}
+		if fragment.NetworkProtocolNumber != sourcePacket.NetworkProtocolNumber {
+			return fmt.Errorf("fragment #%d: got fragment.NetworkProtocolNumber = %d, want = %d", i, fragment.NetworkProtocolNumber, sourcePacket.NetworkProtocolNumber)
+		}
+
+		if len(packets) > 1 {
+			// If the source packet was big enough that it needed fragmentation, let's
+			// inspect the fragment header. Because no other extension headers are
+			// supported, it will always be the last extension header.
+			fragmentHeader := header.IPv6Fragment(fragmentIPHeaders[fragmentIPHeadersLength-header.IPv6FragmentHeaderSize : fragmentIPHeadersLength])
+
+			if got := fragmentHeader.More(); got != wantFragments[i].more {
+				return fmt.Errorf("fragment #%d: got fragmentHeader.More() = %t, want = %t", i, got, wantFragments[i].more)
+			}
+			if got := fragmentHeader.FragmentOffset(); got != wantFragments[i].offset {
+				return fmt.Errorf("fragment #%d: got fragmentHeader.FragmentOffset() = %d, want = %d", i, got, wantFragments[i].offset)
+			}
+			if got := fragmentHeader.NextHeader(); got != uint8(proto) {
+				return fmt.Errorf("fragment #%d: got fragmentHeader.NextHeader() = %d, want = %d", i, got, uint8(proto))
+			}
+		}
+
+		// Store the reassembled payload as we parse each fragment. The payload
+		// includes the Transport header and everything after.
+		reassembledPayload.AppendView(fragment.TransportHeader().View())
+		reassembledPayload.Append(fragment.Data)
+	}
+
+	if diff := cmp.Diff(buffer.View(source[sourceIPHeadersLen:]), reassembledPayload.ToView()); diff != "" {
+		return fmt.Errorf("reassembledPayload mismatch (-want +got):\n%s", diff)
+	}
+
+	return nil
+}
+
 // TestReceiveOnAllNodesMulticastAddr tests that IPv6 endpoints receive ICMP and
 // UDP packets destined to the IPv6 link-local all-nodes multicast address.
 func TestReceiveOnAllNodesMulticastAddr(t *testing.T) {
 	tests := []struct {
 		name            string
-		protocolFactory stack.TransportProtocol
+		protocolFactory stack.TransportProtocolFactory
 		rxf             func(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst tcpip.Address, want uint64)
 	}{
-		{"ICMP", icmp.NewProtocol6(), testReceiveICMP},
-		{"UDP", udp.NewProtocol(), testReceiveUDP},
+		{"ICMP", icmp.NewProtocol6, testReceiveICMP},
+		{"UDP", udp.NewProtocol, testReceiveUDP},
 	}
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
-				TransportProtocols: []stack.TransportProtocol{test.protocolFactory},
+				NetworkProtocols:   []stack.NetworkProtocolFactory{NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{test.protocolFactory},
 			})
-			e := channel.New(10, 1280, linkAddr1)
+			e := channel.New(10, header.IPv6MinimumMTU, linkAddr1)
 			if err := s.CreateNIC(1, e); err != nil {
 				t.Fatalf("CreateNIC(_) = %s", err)
 			}
@@ -168,15 +255,13 @@ func TestReceiveOnAllNodesMulticastAddr(t *testing.T) {
 // packets destined to the IPv6 solicited-node address of an assigned IPv6
 // address.
 func TestReceiveOnSolicitedNodeAddr(t *testing.T) {
-	const nicID = 1
-
 	tests := []struct {
 		name            string
-		protocolFactory stack.TransportProtocol
+		protocolFactory stack.TransportProtocolFactory
 		rxf             func(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst tcpip.Address, want uint64)
 	}{
-		{"ICMP", icmp.NewProtocol6(), testReceiveICMP},
-		{"UDP", udp.NewProtocol(), testReceiveUDP},
+		{"ICMP", icmp.NewProtocol6, testReceiveICMP},
+		{"UDP", udp.NewProtocol, testReceiveUDP},
 	}
 
 	snmc := header.SolicitedNodeAddr(addr2)
@@ -184,16 +269,16 @@ func TestReceiveOnSolicitedNodeAddr(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
-				TransportProtocols: []stack.TransportProtocol{test.protocolFactory},
+				NetworkProtocols:   []stack.NetworkProtocolFactory{NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{test.protocolFactory},
 			})
-			e := channel.New(1, 1280, linkAddr1)
+			e := channel.New(1, header.IPv6MinimumMTU, linkAddr1)
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
 
 			s.SetRouteTable([]tcpip.Route{
-				tcpip.Route{
+				{
 					Destination: header.IPv6EmptySubnet,
 					NIC:         nicID,
 				},
@@ -271,7 +356,7 @@ func TestAddIpv6Address(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
 			})
 			if err := s.CreateNIC(1, &stubLinkEndpoint{}); err != nil {
 				t.Fatalf("CreateNIC(_) = %s", err)
@@ -293,17 +378,22 @@ func TestAddIpv6Address(t *testing.T) {
 }
 
 func TestReceiveIPv6ExtHdrs(t *testing.T) {
-	const nicID = 1
-
 	tests := []struct {
 		name         string
 		extHdr       func(nextHdr uint8) ([]byte, uint8)
 		shouldAccept bool
+		// Should we expect an ICMP response and if so, with what contents?
+		expectICMP bool
+		ICMPType   header.ICMPv6Type
+		ICMPCode   header.ICMPv6Code
+		pointer    uint32
+		multicast  bool
 	}{
 		{
 			name:         "None",
 			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{}, nextHdr },
 			shouldAccept: true,
+			expectICMP:   false,
 		},
 		{
 			name: "hopbyhop with unknown option skippable action",
@@ -334,9 +424,10 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 				}, hopByHopExtHdrID
 			},
 			shouldAccept: false,
+			expectICMP:   false,
 		},
 		{
-			name: "hopbyhop with unknown option discard and send icmp action",
+			name: "hopbyhop with unknown option discard and send icmp action (unicast)",
 			extHdr: func(nextHdr uint8) ([]byte, uint8) {
 				return []byte{
 					nextHdr, 1,
@@ -346,12 +437,38 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 
 					// Discard & send ICMP if option is unknown.
 					191, 6, 1, 2, 3, 4, 5, 6,
+					//^ Unknown option.
 				}, hopByHopExtHdrID
 			},
 			shouldAccept: false,
+			expectICMP:   true,
+			ICMPType:     header.ICMPv6ParamProblem,
+			ICMPCode:     header.ICMPv6UnknownOption,
+			pointer:      header.IPv6FixedHeaderSize + 8,
 		},
 		{
-			name: "hopbyhop with unknown option discard and send icmp action unless multicast dest",
+			name: "hopbyhop with unknown option discard and send icmp action (multicast)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP if option is unknown.
+					191, 6, 1, 2, 3, 4, 5, 6,
+					//^ Unknown option.
+				}, hopByHopExtHdrID
+			},
+			multicast:    true,
+			shouldAccept: false,
+			expectICMP:   true,
+			ICMPType:     header.ICMPv6ParamProblem,
+			ICMPCode:     header.ICMPv6UnknownOption,
+			pointer:      header.IPv6FixedHeaderSize + 8,
+		},
+		{
+			name: "hopbyhop with unknown option discard and send icmp action unless multicast dest (unicast)",
 			extHdr: func(nextHdr uint8) ([]byte, uint8) {
 				return []byte{
 					nextHdr, 1,
@@ -362,39 +479,97 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 					// Discard & send ICMP unless packet is for multicast destination if
 					// option is unknown.
 					255, 6, 1, 2, 3, 4, 5, 6,
+					//^ Unknown option.
 				}, hopByHopExtHdrID
 			},
+			expectICMP: true,
+			ICMPType:   header.ICMPv6ParamProblem,
+			ICMPCode:   header.ICMPv6UnknownOption,
+			pointer:    header.IPv6FixedHeaderSize + 8,
+		},
+		{
+			name: "hopbyhop with unknown option discard and send icmp action unless multicast dest (multicast)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP unless packet is for multicast destination if
+					// option is unknown.
+					255, 6, 1, 2, 3, 4, 5, 6,
+					//^ Unknown option.
+				}, hopByHopExtHdrID
+			},
+			multicast:    true,
 			shouldAccept: false,
+			expectICMP:   false,
 		},
 		{
-			name:         "routing with zero segments left",
-			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 0, 2, 3, 4, 5}, routingExtHdrID },
+			name: "routing with zero segments left",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 0,
+					1, 0, 2, 3, 4, 5,
+				}, routingExtHdrID
+			},
 			shouldAccept: true,
 		},
 		{
-			name:         "routing with non-zero segments left",
-			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 1, 2, 3, 4, 5}, routingExtHdrID },
+			name: "routing with non-zero segments left",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 0,
+					1, 1, 2, 3, 4, 5,
+				}, routingExtHdrID
+			},
 			shouldAccept: false,
+			expectICMP:   true,
+			ICMPType:     header.ICMPv6ParamProblem,
+			ICMPCode:     header.ICMPv6ErroneousHeader,
+			pointer:      header.IPv6FixedHeaderSize + 2,
 		},
 		{
-			name:         "atomic fragment with zero ID",
-			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 0, 0, 0, 0, 0, 0}, fragmentExtHdrID },
+			name: "atomic fragment with zero ID",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 0,
+					0, 0, 0, 0, 0, 0,
+				}, fragmentExtHdrID
+			},
 			shouldAccept: true,
 		},
 		{
-			name:         "atomic fragment with non-zero ID",
-			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 0, 0, 1, 2, 3, 4}, fragmentExtHdrID },
+			name: "atomic fragment with non-zero ID",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 0,
+					0, 0, 1, 2, 3, 4,
+				}, fragmentExtHdrID
+			},
 			shouldAccept: true,
+			expectICMP:   false,
 		},
 		{
-			name:         "fragment",
-			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 0, 1, 2, 3, 4}, fragmentExtHdrID },
+			name: "fragment",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 0,
+					1, 0, 1, 2, 3, 4,
+				}, fragmentExtHdrID
+			},
 			shouldAccept: false,
+			expectICMP:   false,
 		},
 		{
-			name:         "No next header",
-			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{}, noNextHdrID },
+			name: "No next header",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{},
+					noNextHdrID
+			},
 			shouldAccept: false,
+			expectICMP:   false,
 		},
 		{
 			name: "destination with unknown option skippable action",
@@ -410,6 +585,7 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 				}, destinationExtHdrID
 			},
 			shouldAccept: true,
+			expectICMP:   false,
 		},
 		{
 			name: "destination with unknown option discard action",
@@ -425,9 +601,30 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 				}, destinationExtHdrID
 			},
 			shouldAccept: false,
+			expectICMP:   false,
+		},
+		{
+			name: "destination with unknown option discard and send icmp action (unicast)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP if option is unknown.
+					191, 6, 1, 2, 3, 4, 5, 6,
+					//^  191 is an unknown option.
+				}, destinationExtHdrID
+			},
+			shouldAccept: false,
+			expectICMP:   true,
+			ICMPType:     header.ICMPv6ParamProblem,
+			ICMPCode:     header.ICMPv6UnknownOption,
+			pointer:      header.IPv6FixedHeaderSize + 8,
 		},
 		{
-			name: "destination with unknown option discard and send icmp action",
+			name: "destination with unknown option discard and send icmp action (muilticast)",
 			extHdr: func(nextHdr uint8) ([]byte, uint8) {
 				return []byte{
 					nextHdr, 1,
@@ -437,12 +634,18 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 
 					// Discard & send ICMP if option is unknown.
 					191, 6, 1, 2, 3, 4, 5, 6,
+					//^  191 is an unknown option.
 				}, destinationExtHdrID
 			},
+			multicast:    true,
 			shouldAccept: false,
+			expectICMP:   true,
+			ICMPType:     header.ICMPv6ParamProblem,
+			ICMPCode:     header.ICMPv6UnknownOption,
+			pointer:      header.IPv6FixedHeaderSize + 8,
 		},
 		{
-			name: "destination with unknown option discard and send icmp action unless multicast dest",
+			name: "destination with unknown option discard and send icmp action unless multicast dest (unicast)",
 			extHdr: func(nextHdr uint8) ([]byte, uint8) {
 				return []byte{
 					nextHdr, 1,
@@ -453,22 +656,33 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 					// Discard & send ICMP unless packet is for multicast destination if
 					// option is unknown.
 					255, 6, 1, 2, 3, 4, 5, 6,
+					//^ 255 is unknown.
 				}, destinationExtHdrID
 			},
 			shouldAccept: false,
+			expectICMP:   true,
+			ICMPType:     header.ICMPv6ParamProblem,
+			ICMPCode:     header.ICMPv6UnknownOption,
+			pointer:      header.IPv6FixedHeaderSize + 8,
 		},
 		{
-			name: "routing - atomic fragment",
+			name: "destination with unknown option discard and send icmp action unless multicast dest (multicast)",
 			extHdr: func(nextHdr uint8) ([]byte, uint8) {
 				return []byte{
-					// Routing extension header.
-					fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+					nextHdr, 1,
 
-					// Fragment extension header.
-					nextHdr, 0, 0, 0, 1, 2, 3, 4,
-				}, routingExtHdrID
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP unless packet is for multicast destination if
+					// option is unknown.
+					255, 6, 1, 2, 3, 4, 5, 6,
+					//^ 255 is unknown.
+				}, destinationExtHdrID
 			},
-			shouldAccept: true,
+			shouldAccept: false,
+			expectICMP:   false,
+			multicast:    true,
 		},
 		{
 			name: "atomic fragment - routing",
@@ -502,12 +716,42 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 				return []byte{
 					// Routing extension header.
 					hopByHopExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+					// ^^^   The HopByHop extension header may not appear after the first
+					// extension header.
 
 					// Hop By Hop extension header with skippable unknown option.
 					nextHdr, 0, 62, 4, 1, 2, 3, 4,
 				}, routingExtHdrID
 			},
 			shouldAccept: false,
+			expectICMP:   true,
+			ICMPType:     header.ICMPv6ParamProblem,
+			ICMPCode:     header.ICMPv6UnknownHeader,
+			pointer:      header.IPv6FixedHeaderSize,
+		},
+		{
+			name: "routing - hop by hop (with send icmp unknown)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Routing extension header.
+					hopByHopExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+					// ^^^   The HopByHop extension header may not appear after the first
+					// extension header.
+
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Skippable unknown.
+					191, 6, 1, 2, 3, 4, 5, 6,
+				}, routingExtHdrID
+			},
+			shouldAccept: false,
+			expectICMP:   true,
+			ICMPType:     header.ICMPv6ParamProblem,
+			ICMPCode:     header.ICMPv6UnknownHeader,
+			pointer:      header.IPv6FixedHeaderSize,
 		},
 		{
 			name:         "No next header",
@@ -551,6 +795,7 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 				}, hopByHopExtHdrID
 			},
 			shouldAccept: false,
+			expectICMP:   false,
 		},
 		{
 			name: "hopbyhop (with skippable unknown) - routing - atomic fragment - destination (with discard unknown)",
@@ -571,16 +816,17 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 				}, hopByHopExtHdrID
 			},
 			shouldAccept: false,
+			expectICMP:   false,
 		},
 	}
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
-				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				NetworkProtocols:   []stack.NetworkProtocolFactory{NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
 			})
-			e := channel.New(0, 1280, linkAddr1)
+			e := channel.New(1, header.IPv6MinimumMTU, linkAddr1)
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
@@ -588,6 +834,14 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err)
 			}
 
+			// Add a default route so that a return packet knows where to go.
+			s.SetRouteTable([]tcpip.Route{
+				{
+					Destination: header.IPv6EmptySubnet,
+					NIC:         nicID,
+				},
+			})
+
 			wq := waiter.Queue{}
 			we, ch := waiter.NewChannelEntry(nil)
 			wq.EventRegister(&we, waiter.EventIn)
@@ -629,12 +883,16 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 			// Serialize IPv6 fixed header.
 			payloadLength := hdr.UsedLength()
 			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			dstAddr := tcpip.Address(addr2)
+			if test.multicast {
+				dstAddr = header.IPv6AllNodesMulticastAddress
+			}
 			ip.Encode(&header.IPv6Fields{
 				PayloadLength: uint16(payloadLength),
 				NextHeader:    ipv6NextHdr,
 				HopLimit:      255,
 				SrcAddr:       addr1,
-				DstAddr:       addr2,
+				DstAddr:       dstAddr,
 			})
 
 			e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
@@ -648,6 +906,44 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 					t.Errorf("got UDP Rx Packets = %d, want = 0", got)
 				}
 
+				if !test.expectICMP {
+					if p, ok := e.Read(); ok {
+						t.Fatalf("unexpected packet received: %#v", p)
+					}
+					return
+				}
+
+				// ICMP required.
+				p, ok := e.Read()
+				if !ok {
+					t.Fatalf("expected packet wasn't written out")
+				}
+
+				// Pack the output packet into a single buffer.View as the checkers
+				// assume that.
+				vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views())
+				pkt := vv.ToView()
+				if got, want := len(pkt), header.IPv6FixedHeaderSize+header.ICMPv6MinimumSize+hdr.UsedLength(); got != want {
+					t.Fatalf("got an ICMP packet of size = %d, want = %d", got, want)
+				}
+
+				ipHdr := header.IPv6(pkt)
+				checker.IPv6(t, ipHdr, checker.ICMPv6(
+					checker.ICMPv6Type(test.ICMPType),
+					checker.ICMPv6Code(test.ICMPCode)))
+
+				// We know we are looking at no extension headers in the error ICMP
+				// packets.
+				icmpPkt := header.ICMPv6(ipHdr.Payload())
+				// We know we sent small packets that won't be truncated when reflected
+				// back to us.
+				originalPacket := icmpPkt.Payload()
+				if got, want := icmpPkt.TypeSpecific(), test.pointer; got != want {
+					t.Errorf("unexpected ICMPv6 pointer, got = %d, want = %d\n", got, want)
+				}
+				if diff := cmp.Diff(hdr.View(), buffer.View(originalPacket)); diff != "" {
+					t.Errorf("ICMPv6 payload mismatch (-want +got):\n%s", diff)
+				}
 				return
 			}
 
@@ -681,12 +977,12 @@ type fragmentData struct {
 
 func TestReceiveIPv6Fragments(t *testing.T) {
 	const (
-		nicID             = 1
 		udpPayload1Length = 256
 		udpPayload2Length = 128
 		// Used to test cases where the fragment blocks are not a multiple of
 		// the fragment block size of 8 (RFC 8200 section 4.5).
 		udpPayload3Length = 127
+		udpPayload4Length = header.IPv6MaximumPayloadSize - header.UDPMinimumSize
 		fragmentExtHdrLen = 8
 		// Note, not all routing extension headers will be 8 bytes but this test
 		// uses 8 byte routing extension headers for most sub tests.
@@ -731,6 +1027,10 @@ func TestReceiveIPv6Fragments(t *testing.T) {
 	udpPayload3Addr1ToAddr2 := udpPayload3Addr1ToAddr2Buf[:]
 	ipv6Payload3Addr1ToAddr2 := udpGen(udpPayload3Addr1ToAddr2, 3, addr1, addr2)
 
+	var udpPayload4Addr1ToAddr2Buf [udpPayload4Length]byte
+	udpPayload4Addr1ToAddr2 := udpPayload4Addr1ToAddr2Buf[:]
+	ipv6Payload4Addr1ToAddr2 := udpGen(udpPayload4Addr1ToAddr2, 4, addr1, addr2)
+
 	tests := []struct {
 		name             string
 		expectedPayload  []byte
@@ -866,6 +1166,46 @@ func TestReceiveIPv6Fragments(t *testing.T) {
 			expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2},
 		},
 		{
+			name: "Two fragments with different Next Header values",
+			fragments: []fragmentData{
+				{
+					srcAddr: addr1,
+					dstAddr: addr2,
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1Addr1ToAddr2[:64],
+						},
+					),
+				},
+				{
+					srcAddr: addr1,
+					dstAddr: addr2,
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							// NextHeader value is different than the one in the first fragment, so
+							// this NextHeader should be ignored.
+							buffer.View([]byte{uint8(header.IPv6NoNextHeaderIdentifier), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1Addr1ToAddr2[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2},
+		},
+		{
 			name: "Two fragments with last fragment size not a multiple of fragment block size",
 			fragments: []fragmentData{
 				{
@@ -980,6 +1320,44 @@ func TestReceiveIPv6Fragments(t *testing.T) {
 			expectedPayloads: nil,
 		},
 		{
+			name: "Two fragments reassembled into a maximum UDP packet",
+			fragments: []fragmentData{
+				{
+					srcAddr: addr1,
+					dstAddr: addr2,
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+65520,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload4Addr1ToAddr2[:65520],
+						},
+					),
+				},
+				{
+					srcAddr: addr1,
+					dstAddr: addr2,
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload4Addr1ToAddr2)-65520,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8190, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 255, 240, 0, 0, 0, 1}),
+
+							ipv6Payload4Addr1ToAddr2[65520:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload4Addr1ToAddr2},
+		},
+		{
 			name: "Two fragments with per-fragment routing header with zero segments left",
 			fragments: []fragmentData{
 				{
@@ -1464,10 +1842,10 @@ func TestReceiveIPv6Fragments(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
-				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				NetworkProtocols:   []stack.NetworkProtocolFactory{NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
 			})
-			e := channel.New(0, 1280, linkAddr1)
+			e := channel.New(0, header.IPv6MinimumMTU, linkAddr1)
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
@@ -1532,3 +1910,920 @@ func TestReceiveIPv6Fragments(t *testing.T) {
 		})
 	}
 }
+
+func TestInvalidIPv6Fragments(t *testing.T) {
+	const (
+		addr1     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+		addr2     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+		linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e")
+		nicID     = 1
+		hoplimit  = 255
+		ident     = 1
+		data      = "TEST_INVALID_IPV6_FRAGMENTS"
+	)
+
+	type fragmentData struct {
+		ipv6Fields         header.IPv6Fields
+		ipv6FragmentFields header.IPv6FragmentFields
+		payload            []byte
+	}
+
+	tests := []struct {
+		name                   string
+		fragments              []fragmentData
+		wantMalformedIPPackets uint64
+		wantMalformedFragments uint64
+		expectICMP             bool
+		expectICMPType         header.ICMPv6Type
+		expectICMPCode         header.ICMPv6Code
+		expectICMPTypeSpecific uint32
+	}{
+		{
+			name: "fragment size is not a multiple of 8 and the M flag is true",
+			fragments: []fragmentData{
+				{
+					ipv6Fields: header.IPv6Fields{
+						PayloadLength: header.IPv6FragmentHeaderSize + 9,
+						NextHeader:    header.IPv6FragmentHeader,
+						HopLimit:      hoplimit,
+						SrcAddr:       addr1,
+						DstAddr:       addr2,
+					},
+					ipv6FragmentFields: header.IPv6FragmentFields{
+						NextHeader:     uint8(header.UDPProtocolNumber),
+						FragmentOffset: 0 >> 3,
+						M:              true,
+						Identification: ident,
+					},
+					payload: []byte(data)[:9],
+				},
+			},
+			wantMalformedIPPackets: 1,
+			wantMalformedFragments: 1,
+			expectICMP:             true,
+			expectICMPType:         header.ICMPv6ParamProblem,
+			expectICMPCode:         header.ICMPv6ErroneousHeader,
+			expectICMPTypeSpecific: header.IPv6PayloadLenOffset,
+		},
+		{
+			name: "fragments reassembled into a payload exceeding the max IPv6 payload size",
+			fragments: []fragmentData{
+				{
+					ipv6Fields: header.IPv6Fields{
+						PayloadLength: header.IPv6FragmentHeaderSize + 16,
+						NextHeader:    header.IPv6FragmentHeader,
+						HopLimit:      hoplimit,
+						SrcAddr:       addr1,
+						DstAddr:       addr2,
+					},
+					ipv6FragmentFields: header.IPv6FragmentFields{
+						NextHeader:     uint8(header.UDPProtocolNumber),
+						FragmentOffset: ((header.IPv6MaximumPayloadSize + 1) - 16) >> 3,
+						M:              false,
+						Identification: ident,
+					},
+					payload: []byte(data)[:16],
+				},
+			},
+			wantMalformedIPPackets: 1,
+			wantMalformedFragments: 1,
+			expectICMP:             true,
+			expectICMPType:         header.ICMPv6ParamProblem,
+			expectICMPCode:         header.ICMPv6ErroneousHeader,
+			expectICMPTypeSpecific: header.IPv6MinimumSize + 2, /* offset for 'Fragment Offset' in the fragment header */
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocolFactory{
+					NewProtocol,
+				},
+			})
+			e := channel.New(1, 1500, linkAddr1)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err)
+			}
+			s.SetRouteTable([]tcpip.Route{{
+				Destination: header.IPv6EmptySubnet,
+				NIC:         nicID,
+			}})
+
+			var expectICMPPayload buffer.View
+			for _, f := range test.fragments {
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.IPv6FragmentHeaderSize)
+
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize + header.IPv6FragmentHeaderSize))
+				ip.Encode(&f.ipv6Fields)
+
+				fragHDR := header.IPv6Fragment(hdr.View()[header.IPv6MinimumSize:])
+				fragHDR.Encode(&f.ipv6FragmentFields)
+
+				vv := hdr.View().ToVectorisedView()
+				vv.AppendView(f.payload)
+
+				pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+					Data: vv,
+				})
+
+				if test.expectICMP {
+					expectICMPPayload = stack.PayloadSince(pkt.NetworkHeader())
+				}
+
+				e.InjectInbound(ProtocolNumber, pkt)
+			}
+
+			if got, want := s.Stats().IP.MalformedPacketsReceived.Value(), test.wantMalformedIPPackets; got != want {
+				t.Errorf("got Stats.IP.MalformedPacketsReceived = %d, want = %d", got, want)
+			}
+			if got, want := s.Stats().IP.MalformedFragmentsReceived.Value(), test.wantMalformedFragments; got != want {
+				t.Errorf("got Stats.IP.MalformedFragmentsReceived = %d, want = %d", got, want)
+			}
+
+			reply, ok := e.Read()
+			if !test.expectICMP {
+				if ok {
+					t.Fatalf("unexpected ICMP error message received: %#v", reply)
+				}
+				return
+			}
+			if !ok {
+				t.Fatal("expected ICMP error message missing")
+			}
+
+			checker.IPv6(t, stack.PayloadSince(reply.Pkt.NetworkHeader()),
+				checker.SrcAddr(addr2),
+				checker.DstAddr(addr1),
+				checker.IPFullLength(uint16(header.IPv6MinimumSize+header.ICMPv6MinimumSize+expectICMPPayload.Size())),
+				checker.ICMPv6(
+					checker.ICMPv6Type(test.expectICMPType),
+					checker.ICMPv6Code(test.expectICMPCode),
+					checker.ICMPv6TypeSpecific(test.expectICMPTypeSpecific),
+					checker.ICMPv6Payload([]byte(expectICMPPayload)),
+				),
+			)
+		})
+	}
+}
+
+func TestFragmentReassemblyTimeout(t *testing.T) {
+	const (
+		addr1     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+		addr2     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+		linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e")
+		nicID     = 1
+		hoplimit  = 255
+		ident     = 1
+		data      = "TEST_FRAGMENT_REASSEMBLY_TIMEOUT"
+	)
+
+	type fragmentData struct {
+		ipv6Fields         header.IPv6Fields
+		ipv6FragmentFields header.IPv6FragmentFields
+		payload            []byte
+	}
+
+	tests := []struct {
+		name       string
+		fragments  []fragmentData
+		expectICMP bool
+	}{
+		{
+			name: "first fragment only",
+			fragments: []fragmentData{
+				{
+					ipv6Fields: header.IPv6Fields{
+						PayloadLength: header.IPv6FragmentHeaderSize + 16,
+						NextHeader:    header.IPv6FragmentHeader,
+						HopLimit:      hoplimit,
+						SrcAddr:       addr1,
+						DstAddr:       addr2,
+					},
+					ipv6FragmentFields: header.IPv6FragmentFields{
+						NextHeader:     uint8(header.UDPProtocolNumber),
+						FragmentOffset: 0,
+						M:              true,
+						Identification: ident,
+					},
+					payload: []byte(data)[:16],
+				},
+			},
+			expectICMP: true,
+		},
+		{
+			name: "two first fragments",
+			fragments: []fragmentData{
+				{
+					ipv6Fields: header.IPv6Fields{
+						PayloadLength: header.IPv6FragmentHeaderSize + 16,
+						NextHeader:    header.IPv6FragmentHeader,
+						HopLimit:      hoplimit,
+						SrcAddr:       addr1,
+						DstAddr:       addr2,
+					},
+					ipv6FragmentFields: header.IPv6FragmentFields{
+						NextHeader:     uint8(header.UDPProtocolNumber),
+						FragmentOffset: 0,
+						M:              true,
+						Identification: ident,
+					},
+					payload: []byte(data)[:16],
+				},
+				{
+					ipv6Fields: header.IPv6Fields{
+						PayloadLength: header.IPv6FragmentHeaderSize + 16,
+						NextHeader:    header.IPv6FragmentHeader,
+						HopLimit:      hoplimit,
+						SrcAddr:       addr1,
+						DstAddr:       addr2,
+					},
+					ipv6FragmentFields: header.IPv6FragmentFields{
+						NextHeader:     uint8(header.UDPProtocolNumber),
+						FragmentOffset: 0,
+						M:              true,
+						Identification: ident,
+					},
+					payload: []byte(data)[:16],
+				},
+			},
+			expectICMP: true,
+		},
+		{
+			name: "second fragment only",
+			fragments: []fragmentData{
+				{
+					ipv6Fields: header.IPv6Fields{
+						PayloadLength: uint16(header.IPv6FragmentHeaderSize + len(data) - 16),
+						NextHeader:    header.IPv6FragmentHeader,
+						HopLimit:      hoplimit,
+						SrcAddr:       addr1,
+						DstAddr:       addr2,
+					},
+					ipv6FragmentFields: header.IPv6FragmentFields{
+						NextHeader:     uint8(header.UDPProtocolNumber),
+						FragmentOffset: 8,
+						M:              false,
+						Identification: ident,
+					},
+					payload: []byte(data)[16:],
+				},
+			},
+			expectICMP: false,
+		},
+		{
+			name: "two fragments with a gap",
+			fragments: []fragmentData{
+				{
+					ipv6Fields: header.IPv6Fields{
+						PayloadLength: header.IPv6FragmentHeaderSize + 16,
+						NextHeader:    header.IPv6FragmentHeader,
+						HopLimit:      hoplimit,
+						SrcAddr:       addr1,
+						DstAddr:       addr2,
+					},
+					ipv6FragmentFields: header.IPv6FragmentFields{
+						NextHeader:     uint8(header.UDPProtocolNumber),
+						FragmentOffset: 0,
+						M:              true,
+						Identification: ident,
+					},
+					payload: []byte(data)[:16],
+				},
+				{
+					ipv6Fields: header.IPv6Fields{
+						PayloadLength: uint16(header.IPv6FragmentHeaderSize + len(data) - 16),
+						NextHeader:    header.IPv6FragmentHeader,
+						HopLimit:      hoplimit,
+						SrcAddr:       addr1,
+						DstAddr:       addr2,
+					},
+					ipv6FragmentFields: header.IPv6FragmentFields{
+						NextHeader:     uint8(header.UDPProtocolNumber),
+						FragmentOffset: 8,
+						M:              false,
+						Identification: ident,
+					},
+					payload: []byte(data)[16:],
+				},
+			},
+			expectICMP: true,
+		},
+		{
+			name: "two fragments with a gap in reverse order",
+			fragments: []fragmentData{
+				{
+					ipv6Fields: header.IPv6Fields{
+						PayloadLength: uint16(header.IPv6FragmentHeaderSize + len(data) - 16),
+						NextHeader:    header.IPv6FragmentHeader,
+						HopLimit:      hoplimit,
+						SrcAddr:       addr1,
+						DstAddr:       addr2,
+					},
+					ipv6FragmentFields: header.IPv6FragmentFields{
+						NextHeader:     uint8(header.UDPProtocolNumber),
+						FragmentOffset: 8,
+						M:              false,
+						Identification: ident,
+					},
+					payload: []byte(data)[16:],
+				},
+				{
+					ipv6Fields: header.IPv6Fields{
+						PayloadLength: header.IPv6FragmentHeaderSize + 16,
+						NextHeader:    header.IPv6FragmentHeader,
+						HopLimit:      hoplimit,
+						SrcAddr:       addr1,
+						DstAddr:       addr2,
+					},
+					ipv6FragmentFields: header.IPv6FragmentFields{
+						NextHeader:     uint8(header.UDPProtocolNumber),
+						FragmentOffset: 0,
+						M:              true,
+						Identification: ident,
+					},
+					payload: []byte(data)[:16],
+				},
+			},
+			expectICMP: true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			clock := faketime.NewManualClock()
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocolFactory{
+					NewProtocol,
+				},
+				Clock: clock,
+			})
+
+			e := channel.New(1, 1500, linkAddr1)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr2, err)
+			}
+			s.SetRouteTable([]tcpip.Route{{
+				Destination: header.IPv6EmptySubnet,
+				NIC:         nicID,
+			}})
+
+			var firstFragmentSent buffer.View
+			for _, f := range test.fragments {
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.IPv6FragmentHeaderSize)
+
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize + header.IPv6FragmentHeaderSize))
+				ip.Encode(&f.ipv6Fields)
+
+				fragHDR := header.IPv6Fragment(hdr.View()[header.IPv6MinimumSize:])
+				fragHDR.Encode(&f.ipv6FragmentFields)
+
+				vv := hdr.View().ToVectorisedView()
+				vv.AppendView(f.payload)
+
+				pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+					Data: vv,
+				})
+
+				if firstFragmentSent == nil && fragHDR.FragmentOffset() == 0 {
+					firstFragmentSent = stack.PayloadSince(pkt.NetworkHeader())
+				}
+
+				e.InjectInbound(ProtocolNumber, pkt)
+			}
+
+			clock.Advance(ReassembleTimeout)
+
+			reply, ok := e.Read()
+			if !test.expectICMP {
+				if ok {
+					t.Fatalf("unexpected ICMP error message received: %#v", reply)
+				}
+				return
+			}
+			if !ok {
+				t.Fatal("expected ICMP error message missing")
+			}
+			if firstFragmentSent == nil {
+				t.Fatalf("unexpected ICMP error message received: %#v", reply)
+			}
+
+			checker.IPv6(t, stack.PayloadSince(reply.Pkt.NetworkHeader()),
+				checker.SrcAddr(addr2),
+				checker.DstAddr(addr1),
+				checker.IPFullLength(uint16(header.IPv6MinimumSize+header.ICMPv6MinimumSize+firstFragmentSent.Size())),
+				checker.ICMPv6(
+					checker.ICMPv6Type(header.ICMPv6TimeExceeded),
+					checker.ICMPv6Code(header.ICMPv6ReassemblyTimeout),
+					checker.ICMPv6Payload([]byte(firstFragmentSent)),
+				),
+			)
+		})
+	}
+}
+
+func TestWriteStats(t *testing.T) {
+	const nPackets = 3
+	tests := []struct {
+		name          string
+		setup         func(*testing.T, *stack.Stack)
+		allowPackets  int
+		expectSent    int
+		expectDropped int
+		expectWritten int
+	}{
+		{
+			name: "Accept all",
+			// No setup needed, tables accept everything by default.
+			setup:         func(*testing.T, *stack.Stack) {},
+			allowPackets:  math.MaxInt32,
+			expectSent:    nPackets,
+			expectDropped: 0,
+			expectWritten: nPackets,
+		}, {
+			name: "Accept all with error",
+			// No setup needed, tables accept everything by default.
+			setup:         func(*testing.T, *stack.Stack) {},
+			allowPackets:  nPackets - 1,
+			expectSent:    nPackets - 1,
+			expectDropped: 0,
+			expectWritten: nPackets - 1,
+		}, {
+			name: "Drop all",
+			setup: func(t *testing.T, stk *stack.Stack) {
+				// Install Output DROP rule.
+				t.Helper()
+				ipt := stk.IPTables()
+				filter, ok := ipt.GetTable(stack.FilterTable, true /* ipv6 */)
+				if !ok {
+					t.Fatalf("failed to find filter table")
+				}
+				ruleIdx := filter.BuiltinChains[stack.Output]
+				filter.Rules[ruleIdx].Target = &stack.DropTarget{}
+				if err := ipt.ReplaceTable(stack.FilterTable, filter, true /* ipv6 */); err != nil {
+					t.Fatalf("failed to replace table: %v", err)
+				}
+			},
+			allowPackets:  math.MaxInt32,
+			expectSent:    0,
+			expectDropped: nPackets,
+			expectWritten: nPackets,
+		}, {
+			name: "Drop some",
+			setup: func(t *testing.T, stk *stack.Stack) {
+				// Install Output DROP rule that matches only 1
+				// of the 3 packets.
+				t.Helper()
+				ipt := stk.IPTables()
+				filter, ok := ipt.GetTable(stack.FilterTable, true /* ipv6 */)
+				if !ok {
+					t.Fatalf("failed to find filter table")
+				}
+				// We'll match and DROP the last packet.
+				ruleIdx := filter.BuiltinChains[stack.Output]
+				filter.Rules[ruleIdx].Target = &stack.DropTarget{}
+				filter.Rules[ruleIdx].Matchers = []stack.Matcher{&limitedMatcher{nPackets - 1}}
+				// Make sure the next rule is ACCEPT.
+				filter.Rules[ruleIdx+1].Target = &stack.AcceptTarget{}
+				if err := ipt.ReplaceTable(stack.FilterTable, filter, true /* ipv6 */); err != nil {
+					t.Fatalf("failed to replace table: %v", err)
+				}
+			},
+			allowPackets:  math.MaxInt32,
+			expectSent:    nPackets - 1,
+			expectDropped: 1,
+			expectWritten: nPackets,
+		},
+	}
+
+	writers := []struct {
+		name         string
+		writePackets func(*stack.Route, stack.PacketBufferList) (int, *tcpip.Error)
+	}{
+		{
+			name: "WritePacket",
+			writePackets: func(rt *stack.Route, pkts stack.PacketBufferList) (int, *tcpip.Error) {
+				nWritten := 0
+				for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+					if err := rt.WritePacket(nil, stack.NetworkHeaderParams{}, pkt); err != nil {
+						return nWritten, err
+					}
+					nWritten++
+				}
+				return nWritten, nil
+			},
+		}, {
+			name: "WritePackets",
+			writePackets: func(rt *stack.Route, pkts stack.PacketBufferList) (int, *tcpip.Error) {
+				return rt.WritePackets(nil, pkts, stack.NetworkHeaderParams{})
+			},
+		},
+	}
+
+	for _, writer := range writers {
+		t.Run(writer.name, func(t *testing.T) {
+			for _, test := range tests {
+				t.Run(test.name, func(t *testing.T) {
+					ep := testutil.NewMockLinkEndpoint(header.IPv6MinimumMTU, tcpip.ErrInvalidEndpointState, test.allowPackets)
+					rt := buildRoute(t, ep)
+					var pkts stack.PacketBufferList
+					for i := 0; i < nPackets; i++ {
+						pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+							ReserveHeaderBytes: header.UDPMinimumSize + int(rt.MaxHeaderLength()),
+							Data:               buffer.NewView(0).ToVectorisedView(),
+						})
+						pkt.TransportHeader().Push(header.UDPMinimumSize)
+						pkts.PushBack(pkt)
+					}
+
+					test.setup(t, rt.Stack())
+
+					nWritten, _ := writer.writePackets(&rt, pkts)
+
+					if got := int(rt.Stats().IP.PacketsSent.Value()); got != test.expectSent {
+						t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent)
+					}
+					if got := int(rt.Stats().IP.IPTablesOutputDropped.Value()); got != test.expectDropped {
+						t.Errorf("dropped %d packets, but expected to drop %d", got, test.expectDropped)
+					}
+					if nWritten != test.expectWritten {
+						t.Errorf("wrote %d packets, but expected WritePackets to return %d", nWritten, test.expectWritten)
+					}
+				})
+			}
+		})
+	}
+}
+
+func buildRoute(t *testing.T, ep stack.LinkEndpoint) stack.Route {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
+	})
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatalf("CreateNIC(1, _) failed: %s", err)
+	}
+	const (
+		src = "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+		dst = "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+	)
+	if err := s.AddAddress(1, ProtocolNumber, src); err != nil {
+		t.Fatalf("AddAddress(1, %d, %s) failed: %s", ProtocolNumber, src, err)
+	}
+	{
+		mask := tcpip.AddressMask("\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff")
+		subnet, err := tcpip.NewSubnet(dst, mask)
+		if err != nil {
+			t.Fatalf("NewSubnet(%s, %s) failed: %v", dst, mask, err)
+		}
+		s.SetRouteTable([]tcpip.Route{{
+			Destination: subnet,
+			NIC:         1,
+		}})
+	}
+	rt, err := s.FindRoute(1, src, dst, ProtocolNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatalf("FindRoute(1, %s, %s, %d, false) = %s, want = nil", src, dst, ProtocolNumber, err)
+	}
+	return rt
+}
+
+// limitedMatcher is an iptables matcher that matches after a certain number of
+// packets are checked against it.
+type limitedMatcher struct {
+	limit int
+}
+
+// Name implements Matcher.Name.
+func (*limitedMatcher) Name() string {
+	return "limitedMatcher"
+}
+
+// Match implements Matcher.Match.
+func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string) (bool, bool) {
+	if lm.limit == 0 {
+		return true, false
+	}
+	lm.limit--
+	return false, false
+}
+
+func TestClearEndpointFromProtocolOnClose(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
+	})
+	proto := s.NetworkProtocolInstance(ProtocolNumber).(*protocol)
+	ep := proto.NewEndpoint(&testInterface{}, nil, nil, nil).(*endpoint)
+	{
+		proto.mu.Lock()
+		_, hasEP := proto.mu.eps[ep]
+		proto.mu.Unlock()
+		if !hasEP {
+			t.Fatalf("expected protocol to have ep = %p in set of endpoints", ep)
+		}
+	}
+
+	ep.Close()
+
+	{
+		proto.mu.Lock()
+		_, hasEP := proto.mu.eps[ep]
+		proto.mu.Unlock()
+		if hasEP {
+			t.Fatalf("unexpectedly found ep = %p in set of protocol's endpoints", ep)
+		}
+	}
+}
+
+type fragmentInfo struct {
+	offset      uint16
+	more        bool
+	payloadSize uint16
+}
+
+var fragmentationTests = []struct {
+	description   string
+	mtu           uint32
+	gso           *stack.GSO
+	transHdrLen   int
+	payloadSize   int
+	wantFragments []fragmentInfo
+}{
+	{
+		description: "No fragmentation",
+		mtu:         header.IPv6MinimumMTU,
+		gso:         nil,
+		transHdrLen: 0,
+		payloadSize: 1000,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 1000, more: false},
+		},
+	},
+	{
+		description: "Fragmented",
+		mtu:         header.IPv6MinimumMTU,
+		gso:         nil,
+		transHdrLen: 0,
+		payloadSize: 2000,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 1240, more: true},
+			{offset: 154, payloadSize: 776, more: false},
+		},
+	},
+	{
+		description: "Fragmented with mtu not a multiple of 8",
+		mtu:         header.IPv6MinimumMTU + 1,
+		gso:         nil,
+		transHdrLen: 0,
+		payloadSize: 2000,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 1240, more: true},
+			{offset: 154, payloadSize: 776, more: false},
+		},
+	},
+	{
+		description: "No fragmentation with big header",
+		mtu:         2000,
+		gso:         nil,
+		transHdrLen: 100,
+		payloadSize: 1000,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 1100, more: false},
+		},
+	},
+	{
+		description: "Fragmented with gso none",
+		mtu:         header.IPv6MinimumMTU,
+		gso:         &stack.GSO{Type: stack.GSONone},
+		transHdrLen: 0,
+		payloadSize: 1400,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 1240, more: true},
+			{offset: 154, payloadSize: 176, more: false},
+		},
+	},
+	{
+		description: "Fragmented with big header",
+		mtu:         header.IPv6MinimumMTU,
+		gso:         nil,
+		transHdrLen: 100,
+		payloadSize: 1200,
+		wantFragments: []fragmentInfo{
+			{offset: 0, payloadSize: 1240, more: true},
+			{offset: 154, payloadSize: 76, more: false},
+		},
+	},
+}
+
+func TestFragmentationWritePacket(t *testing.T) {
+	const (
+		ttl            = 42
+		tos            = stack.DefaultTOS
+		transportProto = tcp.ProtocolNumber
+	)
+
+	for _, ft := range fragmentationTests {
+		t.Run(ft.description, func(t *testing.T) {
+			pkt := testutil.MakeRandPkt(ft.transHdrLen, extraHeaderReserve+header.IPv6MinimumSize, []int{ft.payloadSize}, header.IPv6ProtocolNumber)
+			source := pkt.Clone()
+			ep := testutil.NewMockLinkEndpoint(ft.mtu, nil, math.MaxInt32)
+			r := buildRoute(t, ep)
+			err := r.WritePacket(ft.gso, stack.NetworkHeaderParams{
+				Protocol: tcp.ProtocolNumber,
+				TTL:      ttl,
+				TOS:      stack.DefaultTOS,
+			}, pkt)
+			if err != nil {
+				t.Fatalf("WritePacket(_, _, _): = %s", err)
+			}
+			if got := len(ep.WrittenPackets); got != len(ft.wantFragments) {
+				t.Errorf("got len(ep.WrittenPackets) = %d, want = %d", got, len(ft.wantFragments))
+			}
+			if got := int(r.Stats().IP.PacketsSent.Value()); got != len(ft.wantFragments) {
+				t.Errorf("got c.Route.Stats().IP.PacketsSent.Value() = %d, want = %d", got, len(ft.wantFragments))
+			}
+			if got := r.Stats().IP.OutgoingPacketErrors.Value(); got != 0 {
+				t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = 0", got)
+			}
+			if err := compareFragments(ep.WrittenPackets, source, ft.mtu, ft.wantFragments, tcp.ProtocolNumber); err != nil {
+				t.Error(err)
+			}
+		})
+	}
+}
+
+func TestFragmentationWritePackets(t *testing.T) {
+	const ttl = 42
+	tests := []struct {
+		description  string
+		insertBefore int
+		insertAfter  int
+	}{
+		{
+			description:  "Single packet",
+			insertBefore: 0,
+			insertAfter:  0,
+		},
+		{
+			description:  "With packet before",
+			insertBefore: 1,
+			insertAfter:  0,
+		},
+		{
+			description:  "With packet after",
+			insertBefore: 0,
+			insertAfter:  1,
+		},
+		{
+			description:  "With packet before and after",
+			insertBefore: 1,
+			insertAfter:  1,
+		},
+	}
+	tinyPacket := testutil.MakeRandPkt(header.TCPMinimumSize, extraHeaderReserve+header.IPv6MinimumSize, []int{1}, header.IPv6ProtocolNumber)
+
+	for _, test := range tests {
+		t.Run(test.description, func(t *testing.T) {
+			for _, ft := range fragmentationTests {
+				t.Run(ft.description, func(t *testing.T) {
+					var pkts stack.PacketBufferList
+					for i := 0; i < test.insertBefore; i++ {
+						pkts.PushBack(tinyPacket.Clone())
+					}
+					pkt := testutil.MakeRandPkt(ft.transHdrLen, extraHeaderReserve+header.IPv6MinimumSize, []int{ft.payloadSize}, header.IPv6ProtocolNumber)
+					source := pkt
+					pkts.PushBack(pkt.Clone())
+					for i := 0; i < test.insertAfter; i++ {
+						pkts.PushBack(tinyPacket.Clone())
+					}
+
+					ep := testutil.NewMockLinkEndpoint(ft.mtu, nil, math.MaxInt32)
+					r := buildRoute(t, ep)
+
+					wantTotalPackets := len(ft.wantFragments) + test.insertBefore + test.insertAfter
+					n, err := r.WritePackets(ft.gso, pkts, stack.NetworkHeaderParams{
+						Protocol: tcp.ProtocolNumber,
+						TTL:      ttl,
+						TOS:      stack.DefaultTOS,
+					})
+					if n != wantTotalPackets || err != nil {
+						t.Errorf("got WritePackets(_, _, _) = (%d, %s), want = (%d, nil)", n, err, wantTotalPackets)
+					}
+					if got := len(ep.WrittenPackets); got != wantTotalPackets {
+						t.Errorf("got len(ep.WrittenPackets) = %d, want = %d", got, wantTotalPackets)
+					}
+					if got := int(r.Stats().IP.PacketsSent.Value()); got != wantTotalPackets {
+						t.Errorf("got c.Route.Stats().IP.PacketsSent.Value() = %d, want = %d", got, wantTotalPackets)
+					}
+					if got := r.Stats().IP.OutgoingPacketErrors.Value(); got != 0 {
+						t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = 0", got)
+					}
+
+					if wantTotalPackets == 0 {
+						return
+					}
+
+					fragments := ep.WrittenPackets[test.insertBefore : len(ft.wantFragments)+test.insertBefore]
+					if err := compareFragments(fragments, source, ft.mtu, ft.wantFragments, tcp.ProtocolNumber); err != nil {
+						t.Error(err)
+					}
+				})
+			}
+		})
+	}
+}
+
+// TestFragmentationErrors checks that errors are returned from WritePacket
+// correctly.
+func TestFragmentationErrors(t *testing.T) {
+	const ttl = 42
+
+	tests := []struct {
+		description    string
+		mtu            uint32
+		transHdrLen    int
+		payloadSize    int
+		allowPackets   int
+		outgoingErrors int
+		mockError      *tcpip.Error
+		wantError      *tcpip.Error
+	}{
+		{
+			description:    "No frag",
+			mtu:            2000,
+			payloadSize:    1000,
+			transHdrLen:    0,
+			allowPackets:   0,
+			outgoingErrors: 1,
+			mockError:      tcpip.ErrAborted,
+			wantError:      tcpip.ErrAborted,
+		},
+		{
+			description:    "Error on first frag",
+			mtu:            1300,
+			payloadSize:    3000,
+			transHdrLen:    0,
+			allowPackets:   0,
+			outgoingErrors: 3,
+			mockError:      tcpip.ErrAborted,
+			wantError:      tcpip.ErrAborted,
+		},
+		{
+			description:    "Error on second frag",
+			mtu:            1500,
+			payloadSize:    4000,
+			transHdrLen:    0,
+			allowPackets:   1,
+			outgoingErrors: 2,
+			mockError:      tcpip.ErrAborted,
+			wantError:      tcpip.ErrAborted,
+		},
+		{
+			description:    "Error when MTU is smaller than transport header",
+			mtu:            header.IPv6MinimumMTU,
+			transHdrLen:    1500,
+			payloadSize:    500,
+			allowPackets:   0,
+			outgoingErrors: 1,
+			mockError:      nil,
+			wantError:      tcpip.ErrMessageTooLong,
+		},
+		{
+			description:    "Error when MTU is smaller than IPv6 minimum MTU",
+			mtu:            header.IPv6MinimumMTU - 1,
+			transHdrLen:    0,
+			payloadSize:    500,
+			allowPackets:   0,
+			outgoingErrors: 1,
+			mockError:      nil,
+			wantError:      tcpip.ErrInvalidEndpointState,
+		},
+	}
+
+	for _, ft := range tests {
+		t.Run(ft.description, func(t *testing.T) {
+			pkt := testutil.MakeRandPkt(ft.transHdrLen, extraHeaderReserve+header.IPv6MinimumSize, []int{ft.payloadSize}, header.IPv6ProtocolNumber)
+			ep := testutil.NewMockLinkEndpoint(ft.mtu, ft.mockError, ft.allowPackets)
+			r := buildRoute(t, ep)
+			err := r.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{
+				Protocol: tcp.ProtocolNumber,
+				TTL:      ttl,
+				TOS:      stack.DefaultTOS,
+			}, pkt)
+			if err != ft.wantError {
+				t.Errorf("got WritePacket(_, _, _) = %s, want = %s", err, ft.wantError)
+			}
+			if got := int(r.Stats().IP.PacketsSent.Value()); got != ft.allowPackets {
+				t.Errorf("got r.Stats().IP.PacketsSent.Value() = %d, want = %d", got, ft.allowPackets)
+			}
+			if got := int(r.Stats().IP.OutgoingPacketErrors.Value()); got != ft.outgoingErrors {
+				t.Errorf("got r.Stats().IP.OutgoingPacketErrors.Value() = %d, want = %d", got, ft.outgoingErrors)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/network/ipv6/ndp.go
index b0873d1af..40da011f8 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/network/ipv6/ndp.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package stack
+package ipv6
 
 import (
 	"fmt"
@@ -23,9 +23,27 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
 const (
+	// defaultRetransmitTimer is the default amount of time to wait between
+	// sending reachability probes.
+	//
+	// Default taken from RETRANS_TIMER of RFC 4861 section 10.
+	defaultRetransmitTimer = time.Second
+
+	// minimumRetransmitTimer is the minimum amount of time to wait between
+	// sending reachability probes.
+	//
+	// Note, RFC 4861 does not impose a minimum Retransmit Timer, but we do here
+	// to make sure the messages are not sent all at once. We also come to this
+	// value because in the RetransmitTimer field of a Router Advertisement, a
+	// value of 0 means unspecified, so the smallest valid value is 1. Note, the
+	// unit of the RetransmitTimer field in the Router Advertisement is
+	// milliseconds.
+	minimumRetransmitTimer = time.Millisecond
+
 	// defaultDupAddrDetectTransmits is the default number of NDP Neighbor
 	// Solicitation messages to send when doing Duplicate Address Detection
 	// for a tentative address.
@@ -34,7 +52,7 @@ const (
 	defaultDupAddrDetectTransmits = 1
 
 	// defaultMaxRtrSolicitations is the default number of Router
-	// Solicitation messages to send when a NIC becomes enabled.
+	// Solicitation messages to send when an IPv6 endpoint becomes enabled.
 	//
 	// Default = 3 (from RFC 4861 section 10).
 	defaultMaxRtrSolicitations = 3
@@ -131,7 +149,7 @@ const (
 	minRegenAdvanceDuration = time.Duration(0)
 
 	// maxSLAACAddrLocalRegenAttempts is the maximum number of times to attempt
-	// SLAAC address regenerations in response to a NIC-local conflict.
+	// SLAAC address regenerations in response to an IPv6 endpoint-local conflict.
 	maxSLAACAddrLocalRegenAttempts = 10
 )
 
@@ -163,7 +181,7 @@ var (
 	// This is exported as a variable (instead of a constant) so tests
 	// can update it to a smaller value.
 	//
-	// This value guarantees that a temporary address will be preferred for at
+	// This value guarantees that a temporary address is preferred for at
 	// least 1hr if the SLAAC prefix is valid for at least that time.
 	MinMaxTempAddrPreferredLifetime = defaultRegenAdvanceDuration + MaxDesyncFactor + time.Hour
 
@@ -173,11 +191,17 @@ var (
 	// This is exported as a variable (instead of a constant) so tests
 	// can update it to a smaller value.
 	//
-	// This value guarantees that a temporary address will be valid for at least
+	// This value guarantees that a temporary address is valid for at least
 	// 2hrs if the SLAAC prefix is valid for at least that time.
 	MinMaxTempAddrValidLifetime = 2 * time.Hour
 )
 
+// NDPEndpoint is an endpoint that supports NDP.
+type NDPEndpoint interface {
+	// SetNDPConfigurations sets the NDP configurations.
+	SetNDPConfigurations(NDPConfigurations)
+}
+
 // DHCPv6ConfigurationFromNDPRA is a configuration available via DHCPv6 that an
 // NDP Router Advertisement informed the Stack about.
 type DHCPv6ConfigurationFromNDPRA int
@@ -192,7 +216,7 @@ const (
 	// DHCPv6ManagedAddress indicates that addresses are available via DHCPv6.
 	//
 	// DHCPv6ManagedAddress also implies DHCPv6OtherConfigurations because DHCPv6
-	// will return all available configuration information.
+	// returns all available configuration information when serving addresses.
 	DHCPv6ManagedAddress
 
 	// DHCPv6OtherConfigurations indicates that other configuration information is
@@ -207,19 +231,18 @@ const (
 // NDPDispatcher is the interface integrators of netstack must implement to
 // receive and handle NDP related events.
 type NDPDispatcher interface {
-	// OnDuplicateAddressDetectionStatus will be called when the DAD process
-	// for an address (addr) on a NIC (with ID nicID) completes. resolved
-	// will be set to true if DAD completed successfully (no duplicate addr
-	// detected); false otherwise (addr was detected to be a duplicate on
-	// the link the NIC is a part of, or it was stopped for some other
-	// reason, such as the address being removed). If an error occured
-	// during DAD, err will be set and resolved must be ignored.
+	// OnDuplicateAddressDetectionStatus is called when the DAD process for an
+	// address (addr) on a NIC (with ID nicID) completes. resolved is set to true
+	// if DAD completed successfully (no duplicate addr detected); false otherwise
+	// (addr was detected to be a duplicate on the link the NIC is a part of, or
+	// it was stopped for some other reason, such as the address being removed).
+	// If an error occured during DAD, err is set and resolved must be ignored.
 	//
 	// This function is not permitted to block indefinitely. This function
 	// is also not permitted to call into the stack.
 	OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error)
 
-	// OnDefaultRouterDiscovered will be called when a new default router is
+	// OnDefaultRouterDiscovered is called when a new default router is
 	// discovered. Implementations must return true if the newly discovered
 	// router should be remembered.
 	//
@@ -227,56 +250,55 @@ type NDPDispatcher interface {
 	// is also not permitted to call into the stack.
 	OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) bool
 
-	// OnDefaultRouterInvalidated will be called when a discovered default
-	// router that was remembered is invalidated.
+	// OnDefaultRouterInvalidated is called when a discovered default router that
+	// was remembered is invalidated.
 	//
 	// This function is not permitted to block indefinitely. This function
 	// is also not permitted to call into the stack.
 	OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address)
 
-	// OnOnLinkPrefixDiscovered will be called when a new on-link prefix is
-	// discovered. Implementations must return true if the newly discovered
-	// on-link prefix should be remembered.
+	// OnOnLinkPrefixDiscovered is called when a new on-link prefix is discovered.
+	// Implementations must return true if the newly discovered on-link prefix
+	// should be remembered.
 	//
 	// This function is not permitted to block indefinitely. This function
 	// is also not permitted to call into the stack.
 	OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) bool
 
-	// OnOnLinkPrefixInvalidated will be called when a discovered on-link
-	// prefix that was remembered is invalidated.
+	// OnOnLinkPrefixInvalidated is called when a discovered on-link prefix that
+	// was remembered is invalidated.
 	//
 	// This function is not permitted to block indefinitely. This function
 	// is also not permitted to call into the stack.
 	OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet)
 
-	// OnAutoGenAddress will be called when a new prefix with its
-	// autonomous address-configuration flag set has been received and SLAAC
-	// has been performed. Implementations may prevent the stack from
-	// assigning the address to the NIC by returning false.
+	// OnAutoGenAddress is called when a new prefix with its autonomous address-
+	// configuration flag set is received and SLAAC was performed. Implementations
+	// may prevent the stack from assigning the address to the NIC by returning
+	// false.
 	//
 	// This function is not permitted to block indefinitely. It must not
 	// call functions on the stack itself.
 	OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool
 
-	// OnAutoGenAddressDeprecated will be called when an auto-generated
-	// address (as part of SLAAC) has been deprecated, but is still
-	// considered valid. Note, if an address is invalidated at the same
-	// time it is deprecated, the deprecation event MAY be omitted.
+	// OnAutoGenAddressDeprecated is called when an auto-generated address (SLAAC)
+	// is deprecated, but is still considered valid. Note, if an address is
+	// invalidated at the same ime it is deprecated, the deprecation event may not
+	// be received.
 	//
 	// This function is not permitted to block indefinitely. It must not
 	// call functions on the stack itself.
 	OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix)
 
-	// OnAutoGenAddressInvalidated will be called when an auto-generated
-	// address (as part of SLAAC) has been invalidated.
+	// OnAutoGenAddressInvalidated is called when an auto-generated address
+	// (SLAAC) is invalidated.
 	//
 	// This function is not permitted to block indefinitely. It must not
 	// call functions on the stack itself.
 	OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix)
 
-	// OnRecursiveDNSServerOption will be called when an NDP option with
-	// recursive DNS servers has been received. Note, addrs may contain
-	// link-local addresses.
+	// OnRecursiveDNSServerOption is called when the stack learns of DNS servers
+	// through NDP. Note, the addresses may contain link-local addresses.
 	//
 	// It is up to the caller to use the DNS Servers only for their valid
 	// lifetime. OnRecursiveDNSServerOption may be called for new or
@@ -288,8 +310,8 @@ type NDPDispatcher interface {
 	// call functions on the stack itself.
 	OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration)
 
-	// OnDNSSearchListOption will be called when an NDP option with a DNS
-	// search list has been received.
+	// OnDNSSearchListOption is called when the stack learns of DNS search lists
+	// through NDP.
 	//
 	// It is up to the caller to use the domain names in the search list
 	// for only their valid lifetime. OnDNSSearchListOption may be called
@@ -298,8 +320,8 @@ type NDPDispatcher interface {
 	// be increased, decreased or completely invalidated when lifetime = 0.
 	OnDNSSearchListOption(nicID tcpip.NICID, domainNames []string, lifetime time.Duration)
 
-	// OnDHCPv6Configuration will be called with an updated configuration that is
-	// available via DHCPv6 for a specified NIC.
+	// OnDHCPv6Configuration is called with an updated configuration that is
+	// available via DHCPv6 for the passed NIC.
 	//
 	// This function is not permitted to block indefinitely. It must not
 	// call functions on the stack itself.
@@ -320,7 +342,7 @@ type NDPConfigurations struct {
 	// Must be greater than or equal to 1ms.
 	RetransmitTimer time.Duration
 
-	// The number of Router Solicitation messages to send when the NIC
+	// The number of Router Solicitation messages to send when the IPv6 endpoint
 	// becomes enabled.
 	MaxRtrSolicitations uint8
 
@@ -335,24 +357,22 @@ type NDPConfigurations struct {
 	// Must be greater than or equal to 0s.
 	MaxRtrSolicitationDelay time.Duration
 
-	// HandleRAs determines whether or not Router Advertisements will be
-	// processed.
+	// HandleRAs determines whether or not Router Advertisements are processed.
 	HandleRAs bool
 
-	// DiscoverDefaultRouters determines whether or not default routers will
-	// be discovered from Router Advertisements. This configuration is
-	// ignored if HandleRAs is false.
+	// DiscoverDefaultRouters determines whether or not default routers are
+	// discovered from Router Advertisements, as per RFC 4861 section 6. This
+	// configuration is ignored if HandleRAs is false.
 	DiscoverDefaultRouters bool
 
-	// DiscoverOnLinkPrefixes determines whether or not on-link prefixes
-	// will be discovered from Router Advertisements' Prefix Information
-	// option. This configuration is ignored if HandleRAs is false.
+	// DiscoverOnLinkPrefixes determines whether or not on-link prefixes are
+	// discovered from Router Advertisements' Prefix Information option, as per
+	// RFC 4861 section 6. This configuration is ignored if HandleRAs is false.
 	DiscoverOnLinkPrefixes bool
 
-	// AutoGenGlobalAddresses determines whether or not global IPv6
-	// addresses will be generated for a NIC in response to receiving a new
-	// Prefix Information option with its Autonomous Address
-	// AutoConfiguration flag set, as a host, as per RFC 4862 (SLAAC).
+	// AutoGenGlobalAddresses determines whether or not an IPv6 endpoint performs
+	// SLAAC to auto-generate global SLAAC addresses in response to Prefix
+	// Information options, as per RFC 4862.
 	//
 	// Note, if an address was already generated for some unique prefix, as
 	// part of SLAAC, this option does not affect whether or not the
@@ -366,12 +386,12 @@ type NDPConfigurations struct {
 	//
 	// If the method used to generate the address does not support creating
 	// alternative addresses (e.g. IIDs based on the modified EUI64 of a NIC's
-	// MAC address), then no attempt will be made to resolve the conflict.
+	// MAC address), then no attempt is made to resolve the conflict.
 	AutoGenAddressConflictRetries uint8
 
 	// AutoGenTempGlobalAddresses determines whether or not temporary SLAAC
-	// addresses will be generated for a NIC as part of SLAAC privacy extensions,
-	// RFC 4941.
+	// addresses are generated for an IPv6 endpoint as part of SLAAC privacy
+	// extensions, as per RFC 4941.
 	//
 	// Ignored if AutoGenGlobalAddresses is false.
 	AutoGenTempGlobalAddresses bool
@@ -410,7 +430,7 @@ func DefaultNDPConfigurations() NDPConfigurations {
 }
 
 // validate modifies an NDPConfigurations with valid values. If invalid values
-// are present in c, the corresponding default values will be used instead.
+// are present in c, the corresponding default values are used instead.
 func (c *NDPConfigurations) validate() {
 	if c.RetransmitTimer < minimumRetransmitTimer {
 		c.RetransmitTimer = defaultRetransmitTimer
@@ -439,8 +459,8 @@ func (c *NDPConfigurations) validate() {
 
 // ndpState is the per-interface NDP state.
 type ndpState struct {
-	// The NIC this ndpState is for.
-	nic *NIC
+	// The IPv6 endpoint this ndpState is for.
+	ep *endpoint
 
 	// configs is the per-interface NDP configurations.
 	configs NDPConfigurations
@@ -458,8 +478,8 @@ type ndpState struct {
 		// Used to let the Router Solicitation timer know that it has been stopped.
 		//
 		// Must only be read from or written to while protected by the lock of
-		// the NIC this ndpState is associated with. MUST be set when the timer is
-		// set.
+		// the IPv6 endpoint this ndpState is associated with. MUST be set when the
+		// timer is set.
 		done *bool
 	}
 
@@ -492,7 +512,7 @@ type dadState struct {
 	// Used to let the DAD timer know that it has been stopped.
 	//
 	// Must only be read from or written to while protected by the lock of
-	// the NIC this dadState is associated with.
+	// the IPv6 endpoint this dadState is associated with.
 	done *bool
 }
 
@@ -537,7 +557,7 @@ type tempSLAACAddrState struct {
 	// The address's endpoint.
 	//
 	// Must not be nil.
-	ref *referencedNetworkEndpoint
+	addressEndpoint stack.AddressEndpoint
 
 	// Has a new temporary SLAAC address already been regenerated?
 	regenerated bool
@@ -567,10 +587,10 @@ type slaacPrefixState struct {
 		//
 		// May only be nil when the address is being (re-)generated. Otherwise,
 		// must not be nil as all SLAAC prefixes must have a stable address.
-		ref *referencedNetworkEndpoint
+		addressEndpoint stack.AddressEndpoint
 
-		// The number of times an address has been generated locally where the NIC
-		// already had the generated address.
+		// The number of times an address has been generated locally where the IPv6
+		// endpoint already had the generated address.
 		localGenerationFailures uint8
 	}
 
@@ -578,11 +598,12 @@ type slaacPrefixState struct {
 	tempAddrs map[tcpip.Address]tempSLAACAddrState
 
 	// The next two fields are used by both stable and temporary addresses
-	// generated for a SLAAC prefix. This is safe as only 1 address will be
-	// in the generation and DAD process at any time. That is, no two addresses
-	// will be generated at the same time for a given SLAAC prefix.
+	// generated for a SLAAC prefix. This is safe as only 1 address is in the
+	// generation and DAD process at any time. That is, no two addresses are
+	// generated at the same time for a given SLAAC prefix.
 
-	// The number of times an address has been generated and added to the NIC.
+	// The number of times an address has been generated and added to the IPv6
+	// endpoint.
 	//
 	// Addresses may be regenerated in reseponse to a DAD conflicts.
 	generationAttempts uint8
@@ -597,16 +618,16 @@ type slaacPrefixState struct {
 // This function must only be called by IPv6 addresses that are currently
 // tentative.
 //
-// The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *referencedNetworkEndpoint) *tcpip.Error {
+// The IPv6 endpoint that ndp belongs to MUST be locked.
+func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, addressEndpoint stack.AddressEndpoint) *tcpip.Error {
 	// addr must be a valid unicast IPv6 address.
 	if !header.IsV6UnicastAddress(addr) {
 		return tcpip.ErrAddressFamilyNotSupported
 	}
 
-	if ref.getKind() != permanentTentative {
+	if addressEndpoint.GetKind() != stack.PermanentTentative {
 		// The endpoint should be marked as tentative since we are starting DAD.
-		panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.nic.ID()))
+		panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.ep.nic.ID()))
 	}
 
 	// Should not attempt to perform DAD on an address that is currently in the
@@ -617,18 +638,18 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		// existed, we would get an error since we attempted to add a duplicate
 		// address, or its reference count would have been increased without doing
 		// the work that would have been done for an address that was brand new.
-		// See NIC.addAddressLocked.
-		panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID()))
+		// See endpoint.addAddressLocked.
+		panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.ep.nic.ID()))
 	}
 
 	remaining := ndp.configs.DupAddrDetectTransmits
 	if remaining == 0 {
-		ref.setKind(permanent)
+		addressEndpoint.SetKind(stack.Permanent)
 
 		// Consider DAD to have resolved even if no DAD messages were actually
 		// transmitted.
-		if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
-			ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, true, nil)
+		if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+			ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, true, nil)
 		}
 
 		return nil
@@ -637,25 +658,25 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 	var done bool
 	var timer tcpip.Timer
 	// We initially start a timer to fire immediately because some of the DAD work
-	// cannot be done while holding the NIC's lock. This is effectively the same
-	// as starting a goroutine but we use a timer that fires immediately so we can
-	// reset it for the next DAD iteration.
-	timer = ndp.nic.stack.Clock().AfterFunc(0, func() {
-		ndp.nic.mu.Lock()
-		defer ndp.nic.mu.Unlock()
+	// cannot be done while holding the IPv6 endpoint's lock. This is effectively
+	// the same as starting a goroutine but we use a timer that fires immediately
+	// so we can reset it for the next DAD iteration.
+	timer = ndp.ep.protocol.stack.Clock().AfterFunc(0, func() {
+		ndp.ep.mu.Lock()
+		defer ndp.ep.mu.Unlock()
 
 		if done {
 			// If we reach this point, it means that the DAD timer fired after
-			// another goroutine already obtained the NIC lock and stopped DAD
-			// before this function obtained the NIC lock. Simply return here and do
-			// nothing further.
+			// another goroutine already obtained the IPv6 endpoint lock and stopped
+			// DAD before this function obtained the NIC lock. Simply return here and
+			// do nothing further.
 			return
 		}
 
-		if ref.getKind() != permanentTentative {
+		if addressEndpoint.GetKind() != stack.PermanentTentative {
 			// The endpoint should still be marked as tentative since we are still
 			// performing DAD on it.
-			panic(fmt.Sprintf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.nic.ID()))
+			panic(fmt.Sprintf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.ep.nic.ID()))
 		}
 
 		dadDone := remaining == 0
@@ -663,33 +684,34 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		var err *tcpip.Error
 		if !dadDone {
 			// Use the unspecified address as the source address when performing DAD.
-			ref := ndp.nic.getRefOrCreateTempLocked(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint)
+			addressEndpoint := ndp.ep.acquireAddressOrCreateTempLocked(header.IPv6Any, true /* createTemp */, stack.NeverPrimaryEndpoint)
 
 			// Do not hold the lock when sending packets which may be a long running
 			// task or may block link address resolution. We know this is safe
 			// because immediately after obtaining the lock again, we check if DAD
-			// has been stopped before doing any work with the NIC. Note, DAD would be
-			// stopped if the NIC was disabled or removed, or if the address was
-			// removed.
-			ndp.nic.mu.Unlock()
-			err = ndp.sendDADPacket(addr, ref)
-			ndp.nic.mu.Lock()
+			// has been stopped before doing any work with the IPv6 endpoint. Note,
+			// DAD would be stopped if the IPv6 endpoint was disabled or closed, or if
+			// the address was removed.
+			ndp.ep.mu.Unlock()
+			err = ndp.sendDADPacket(addr, addressEndpoint)
+			ndp.ep.mu.Lock()
+			addressEndpoint.DecRef()
 		}
 
 		if done {
 			// If we reach this point, it means that DAD was stopped after we released
-			// the NIC's read lock and before we obtained the write lock.
+			// the IPv6 endpoint's read lock and before we obtained the write lock.
 			return
 		}
 
 		if dadDone {
 			// DAD has resolved.
-			ref.setKind(permanent)
+			addressEndpoint.SetKind(stack.Permanent)
 		} else if err == nil {
 			// DAD is not done and we had no errors when sending the last NDP NS,
 			// schedule the next DAD timer.
 			remaining--
-			timer.Reset(ndp.nic.stack.ndpConfigs.RetransmitTimer)
+			timer.Reset(ndp.configs.RetransmitTimer)
 			return
 		}
 
@@ -698,16 +720,16 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		// integrator know DAD has completed.
 		delete(ndp.dad, addr)
 
-		if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
-			ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, dadDone, err)
+		if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+			ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, dadDone, err)
 		}
 
 		// If DAD resolved for a stable SLAAC address, attempt generation of a
 		// temporary SLAAC address.
-		if dadDone && ref.configType == slaac {
+		if dadDone && addressEndpoint.ConfigType() == stack.AddressConfigSlaac {
 			// Reset the generation attempts counter as we are starting the generation
 			// of a new address for the SLAAC prefix.
-			ndp.regenerateTempSLAACAddr(ref.addrWithPrefix().Subnet(), true /* resetGenAttempts */)
+			ndp.regenerateTempSLAACAddr(addressEndpoint.AddressWithPrefix().Subnet(), true /* resetGenAttempts */)
 		}
 	})
 
@@ -722,28 +744,31 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 // sendDADPacket sends a NS message to see if any nodes on ndp's NIC's link owns
 // addr.
 //
-// addr must be a tentative IPv6 address on ndp's NIC.
+// addr must be a tentative IPv6 address on ndp's IPv6 endpoint.
 //
-// The NIC ndp belongs to MUST NOT be locked.
-func (ndp *ndpState) sendDADPacket(addr tcpip.Address, ref *referencedNetworkEndpoint) *tcpip.Error {
+// The IPv6 endpoint that ndp belongs to MUST NOT be locked.
+func (ndp *ndpState) sendDADPacket(addr tcpip.Address, addressEndpoint stack.AddressEndpoint) *tcpip.Error {
 	snmc := header.SolicitedNodeAddr(addr)
 
-	r := makeRoute(header.IPv6ProtocolNumber, ref.address(), snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false)
+	r, err := ndp.ep.protocol.stack.FindRoute(ndp.ep.nic.ID(), header.IPv6Any, snmc, ProtocolNumber, false /* multicastLoop */)
+	if err != nil {
+		return err
+	}
 	defer r.Release()
 
 	// Route should resolve immediately since snmc is a multicast address so a
 	// remote link address can be calculated without a resolution process.
 	if c, err := r.Resolve(nil); err != nil {
 		// Do not consider the NIC being unknown or disabled as a fatal error.
-		// Since this method is required to be called when the NIC is not locked,
-		// the NIC could have been disabled or removed by another goroutine.
+		// Since this method is required to be called when the IPv6 endpoint is not
+		// locked, the NIC could have been disabled or removed by another goroutine.
 		if err == tcpip.ErrUnknownNICID || err != tcpip.ErrInvalidEndpointState {
 			return err
 		}
 
-		panic(fmt.Sprintf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.nic.ID(), err))
+		panic(fmt.Sprintf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.ep.nic.ID(), err))
 	} else if c != nil {
-		panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.nic.ID()))
+		panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.ep.nic.ID()))
 	}
 
 	icmpData := header.ICMPv6(buffer.NewView(header.ICMPv6NeighborSolicitMinimumSize))
@@ -752,17 +777,16 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address, ref *referencedNetworkEnd
 	ns.SetTargetAddress(addr)
 	icmpData.SetChecksum(header.ICMPv6Checksum(icmpData, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
-	pkt := NewPacketBuffer(PacketBufferOptions{
+	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
 		ReserveHeaderBytes: int(r.MaxHeaderLength()),
 		Data:               buffer.View(icmpData).ToVectorisedView(),
 	})
 
 	sent := r.Stats().ICMP.V6PacketsSent
 	if err := r.WritePacket(nil,
-		NetworkHeaderParams{
+		stack.NetworkHeaderParams{
 			Protocol: header.ICMPv6ProtocolNumber,
 			TTL:      header.NDPHopLimit,
-			TOS:      DefaultTOS,
 		}, pkt,
 	); err != nil {
 		sent.Dropped.Increment()
@@ -778,11 +802,9 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address, ref *referencedNetworkEnd
 // such a state forever, unless some other external event resolves the DAD
 // process (receiving an NA from the true owner of addr, or an NS for addr
 // (implying another node is attempting to use addr)). It is up to the caller
-// of this function to handle such a scenario. Normally, addr will be removed
-// from n right after this function returns or the address successfully
-// resolved.
+// of this function to handle such a scenario.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) {
 	dad, ok := ndp.dad[addr]
 	if !ok {
@@ -801,30 +823,30 @@ func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) {
 	delete(ndp.dad, addr)
 
 	// Let the integrator know DAD did not resolve.
-	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
-		ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, false, nil)
+	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnDuplicateAddressDetectionStatus(ndp.ep.nic.ID(), addr, false, nil)
 	}
 }
 
 // handleRA handles a Router Advertisement message that arrived on the NIC
 // this ndp is for. Does nothing if the NIC is configured to not handle RAs.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
-	// Is the NIC configured to handle RAs at all?
+	// Is the IPv6 endpoint configured to handle RAs at all?
 	//
 	// Currently, the stack does not determine router interface status on a
-	// per-interface basis; it is a stack-wide configuration, so we check
-	// stack's forwarding flag to determine if the NIC is a routing
-	// interface.
-	if !ndp.configs.HandleRAs || ndp.nic.stack.forwarding {
+	// per-interface basis; it is a protocol-wide configuration, so we check the
+	// protocol's forwarding flag to determine if the IPv6 endpoint is forwarding
+	// packets.
+	if !ndp.configs.HandleRAs || ndp.ep.protocol.Forwarding() {
 		return
 	}
 
 	// Only worry about the DHCPv6 configuration if we have an NDPDispatcher as we
 	// only inform the dispatcher on configuration changes. We do nothing else
 	// with the information.
-	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
 		var configuration DHCPv6ConfigurationFromNDPRA
 		switch {
 		case ra.ManagedAddrConfFlag():
@@ -839,11 +861,11 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 
 		if ndp.dhcpv6Configuration != configuration {
 			ndp.dhcpv6Configuration = configuration
-			ndpDisp.OnDHCPv6Configuration(ndp.nic.ID(), configuration)
+			ndpDisp.OnDHCPv6Configuration(ndp.ep.nic.ID(), configuration)
 		}
 	}
 
-	// Is the NIC configured to discover default routers?
+	// Is the IPv6 endpoint configured to discover default routers?
 	if ndp.configs.DiscoverDefaultRouters {
 		rtr, ok := ndp.defaultRouters[ip]
 		rl := ra.RouterLifetime()
@@ -881,20 +903,20 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 	for opt, done, _ := it.Next(); !done; opt, done, _ = it.Next() {
 		switch opt := opt.(type) {
 		case header.NDPRecursiveDNSServer:
-			if ndp.nic.stack.ndpDisp == nil {
+			if ndp.ep.protocol.ndpDisp == nil {
 				continue
 			}
 
 			addrs, _ := opt.Addresses()
-			ndp.nic.stack.ndpDisp.OnRecursiveDNSServerOption(ndp.nic.ID(), addrs, opt.Lifetime())
+			ndp.ep.protocol.ndpDisp.OnRecursiveDNSServerOption(ndp.ep.nic.ID(), addrs, opt.Lifetime())
 
 		case header.NDPDNSSearchList:
-			if ndp.nic.stack.ndpDisp == nil {
+			if ndp.ep.protocol.ndpDisp == nil {
 				continue
 			}
 
 			domainNames, _ := opt.DomainNames()
-			ndp.nic.stack.ndpDisp.OnDNSSearchListOption(ndp.nic.ID(), domainNames, opt.Lifetime())
+			ndp.ep.protocol.ndpDisp.OnDNSSearchListOption(ndp.ep.nic.ID(), domainNames, opt.Lifetime())
 
 		case header.NDPPrefixInformation:
 			prefix := opt.Subnet()
@@ -928,7 +950,7 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 
 // invalidateDefaultRouter invalidates a discovered default router.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
 	rtr, ok := ndp.defaultRouters[ip]
 
@@ -942,32 +964,32 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
 	delete(ndp.defaultRouters, ip)
 
 	// Let the integrator know a discovered default router is invalidated.
-	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
-		ndpDisp.OnDefaultRouterInvalidated(ndp.nic.ID(), ip)
+	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnDefaultRouterInvalidated(ndp.ep.nic.ID(), ip)
 	}
 }
 
 // rememberDefaultRouter remembers a newly discovered default router with IPv6
 // link-local address ip with lifetime rl.
 //
-// The router identified by ip MUST NOT already be known by the NIC.
+// The router identified by ip MUST NOT already be known by the IPv6 endpoint.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
-	ndpDisp := ndp.nic.stack.ndpDisp
+	ndpDisp := ndp.ep.protocol.ndpDisp
 	if ndpDisp == nil {
 		return
 	}
 
 	// Inform the integrator when we discovered a default router.
-	if !ndpDisp.OnDefaultRouterDiscovered(ndp.nic.ID(), ip) {
+	if !ndpDisp.OnDefaultRouterDiscovered(ndp.ep.nic.ID(), ip) {
 		// Informed by the integrator to not remember the router, do
 		// nothing further.
 		return
 	}
 
 	state := defaultRouterState{
-		invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() {
+		invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() {
 			ndp.invalidateDefaultRouter(ip)
 		}),
 	}
@@ -982,22 +1004,22 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
 //
 // The prefix identified by prefix MUST NOT already be known.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) {
-	ndpDisp := ndp.nic.stack.ndpDisp
+	ndpDisp := ndp.ep.protocol.ndpDisp
 	if ndpDisp == nil {
 		return
 	}
 
 	// Inform the integrator when we discovered an on-link prefix.
-	if !ndpDisp.OnOnLinkPrefixDiscovered(ndp.nic.ID(), prefix) {
+	if !ndpDisp.OnOnLinkPrefixDiscovered(ndp.ep.nic.ID(), prefix) {
 		// Informed by the integrator to not remember the prefix, do
 		// nothing further.
 		return
 	}
 
 	state := onLinkPrefixState{
-		invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() {
+		invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() {
 			ndp.invalidateOnLinkPrefix(prefix)
 		}),
 	}
@@ -1011,7 +1033,7 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration)
 
 // invalidateOnLinkPrefix invalidates a discovered on-link prefix.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
 	s, ok := ndp.onLinkPrefixes[prefix]
 
@@ -1025,8 +1047,8 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
 	delete(ndp.onLinkPrefixes, prefix)
 
 	// Let the integrator know a discovered on-link prefix is invalidated.
-	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
-		ndpDisp.OnOnLinkPrefixInvalidated(ndp.nic.ID(), prefix)
+	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnOnLinkPrefixInvalidated(ndp.ep.nic.ID(), prefix)
 	}
 }
 
@@ -1036,7 +1058,7 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
 // handleOnLinkPrefixInformation assumes that the prefix this pi is for is
 // not the link-local prefix and the on-link flag is set.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformation) {
 	prefix := pi.Subnet()
 	prefixState, ok := ndp.onLinkPrefixes[prefix]
@@ -1089,7 +1111,7 @@ func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformatio
 // handleAutonomousPrefixInformation assumes that the prefix this pi is for is
 // not the link-local prefix and the autonomous flag is set.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInformation) {
 	vl := pi.ValidLifetime()
 	pl := pi.PreferredLifetime()
@@ -1125,7 +1147,7 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 //
 // pl is the new preferred lifetime. vl is the new valid lifetime.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 	// If we do not already have an address for this prefix and the valid
 	// lifetime is 0, no need to do anything further, as per RFC 4862
@@ -1142,15 +1164,15 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 	}
 
 	state := slaacPrefixState{
-		deprecationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() {
+		deprecationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() {
 			state, ok := ndp.slaacPrefixes[prefix]
 			if !ok {
 				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the deprecated SLAAC prefix %s", prefix))
 			}
 
-			ndp.deprecateSLAACAddress(state.stableAddr.ref)
+			ndp.deprecateSLAACAddress(state.stableAddr.addressEndpoint)
 		}),
-		invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() {
+		invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() {
 			state, ok := ndp.slaacPrefixes[prefix]
 			if !ok {
 				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the invalidated SLAAC prefix %s", prefix))
@@ -1189,7 +1211,7 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 	}
 
 	// If the address is assigned (DAD resolved), generate a temporary address.
-	if state.stableAddr.ref.getKind() == permanent {
+	if state.stableAddr.addressEndpoint.GetKind() == stack.Permanent {
 		// Reset the generation attempts counter as we are starting the generation
 		// of a new address for the SLAAC prefix.
 		ndp.generateTempSLAACAddr(prefix, &state, true /* resetGenAttempts */)
@@ -1198,32 +1220,27 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 	ndp.slaacPrefixes[prefix] = state
 }
 
-// addSLAACAddr adds a SLAAC address to the NIC.
+// addAndAcquireSLAACAddr adds a SLAAC address to the IPv6 endpoint.
 //
-// The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) addSLAACAddr(addr tcpip.AddressWithPrefix, configType networkEndpointConfigType, deprecated bool) *referencedNetworkEndpoint {
+// The IPv6 endpoint that ndp belongs to MUST be locked.
+func (ndp *ndpState) addAndAcquireSLAACAddr(addr tcpip.AddressWithPrefix, configType stack.AddressConfigType, deprecated bool) stack.AddressEndpoint {
 	// Inform the integrator that we have a new SLAAC address.
-	ndpDisp := ndp.nic.stack.ndpDisp
+	ndpDisp := ndp.ep.protocol.ndpDisp
 	if ndpDisp == nil {
 		return nil
 	}
 
-	if !ndpDisp.OnAutoGenAddress(ndp.nic.ID(), addr) {
+	if !ndpDisp.OnAutoGenAddress(ndp.ep.nic.ID(), addr) {
 		// Informed by the integrator not to add the address.
 		return nil
 	}
 
-	protocolAddr := tcpip.ProtocolAddress{
-		Protocol:          header.IPv6ProtocolNumber,
-		AddressWithPrefix: addr,
-	}
-
-	ref, err := ndp.nic.addAddressLocked(protocolAddr, FirstPrimaryEndpoint, permanent, configType, deprecated)
+	addressEndpoint, err := ndp.ep.addAndAcquirePermanentAddressLocked(addr, stack.FirstPrimaryEndpoint, configType, deprecated)
 	if err != nil {
-		panic(fmt.Sprintf("ndp: error when adding SLAAC address %+v: %s", protocolAddr, err))
+		panic(fmt.Sprintf("ndp: error when adding SLAAC address %+v: %s", addr, err))
 	}
 
-	return ref
+	return addressEndpoint
 }
 
 // generateSLAACAddr generates a SLAAC address for prefix.
@@ -1232,10 +1249,10 @@ func (ndp *ndpState) addSLAACAddr(addr tcpip.AddressWithPrefix, configType netwo
 //
 // Panics if the prefix is not a SLAAC prefix or it already has an address.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixState) bool {
-	if r := state.stableAddr.ref; r != nil {
-		panic(fmt.Sprintf("ndp: SLAAC prefix %s already has a permenant address %s", prefix, r.addrWithPrefix()))
+	if addressEndpoint := state.stableAddr.addressEndpoint; addressEndpoint != nil {
+		panic(fmt.Sprintf("ndp: SLAAC prefix %s already has a permenant address %s", prefix, addressEndpoint.AddressWithPrefix()))
 	}
 
 	// If we have already reached the maximum address generation attempts for the
@@ -1255,11 +1272,11 @@ func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixSt
 		}
 
 		dadCounter := state.generationAttempts + state.stableAddr.localGenerationFailures
-		if oIID := ndp.nic.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
+		if oIID := ndp.ep.protocol.opaqueIIDOpts; oIID.NICNameFromID != nil {
 			addrBytes = header.AppendOpaqueInterfaceIdentifier(
 				addrBytes[:header.IIDOffsetInIPv6Address],
 				prefix,
-				oIID.NICNameFromID(ndp.nic.ID(), ndp.nic.name),
+				oIID.NICNameFromID(ndp.ep.nic.ID(), ndp.ep.nic.Name()),
 				dadCounter,
 				oIID.SecretKey,
 			)
@@ -1272,7 +1289,7 @@ func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixSt
 			//
 			// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
 			// LinkEndpoint.LinkAddress) before reaching this point.
-			linkAddr := ndp.nic.linkEP.LinkAddress()
+			linkAddr := ndp.ep.nic.LinkAddress()
 			if !header.IsValidUnicastEthernetAddress(linkAddr) {
 				return false
 			}
@@ -1291,15 +1308,15 @@ func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixSt
 			PrefixLen: validPrefixLenForAutoGen,
 		}
 
-		if !ndp.nic.hasPermanentAddrLocked(generatedAddr.Address) {
+		if !ndp.ep.hasPermanentAddressRLocked(generatedAddr.Address) {
 			break
 		}
 
 		state.stableAddr.localGenerationFailures++
 	}
 
-	if ref := ndp.addSLAACAddr(generatedAddr, slaac, time.Since(state.preferredUntil) >= 0 /* deprecated */); ref != nil {
-		state.stableAddr.ref = ref
+	if addressEndpoint := ndp.addAndAcquireSLAACAddr(generatedAddr, stack.AddressConfigSlaac, time.Since(state.preferredUntil) >= 0 /* deprecated */); addressEndpoint != nil {
+		state.stableAddr.addressEndpoint = addressEndpoint
 		state.generationAttempts++
 		return true
 	}
@@ -1309,10 +1326,9 @@ func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixSt
 
 // regenerateSLAACAddr regenerates an address for a SLAAC prefix.
 //
-// If generating a new address for the prefix fails, the prefix will be
-// invalidated.
+// If generating a new address for the prefix fails, the prefix is invalidated.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) regenerateSLAACAddr(prefix tcpip.Subnet) {
 	state, ok := ndp.slaacPrefixes[prefix]
 	if !ok {
@@ -1332,7 +1348,7 @@ func (ndp *ndpState) regenerateSLAACAddr(prefix tcpip.Subnet) {
 
 // generateTempSLAACAddr generates a new temporary SLAAC address.
 //
-// If resetGenAttempts is true, the prefix's generation counter will be reset.
+// If resetGenAttempts is true, the prefix's generation counter is reset.
 //
 // Returns true if a new address was generated.
 func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *slaacPrefixState, resetGenAttempts bool) bool {
@@ -1353,7 +1369,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla
 		return false
 	}
 
-	stableAddr := prefixState.stableAddr.ref.address()
+	stableAddr := prefixState.stableAddr.addressEndpoint.AddressWithPrefix().Address
 	now := time.Now()
 
 	// As per RFC 4941 section 3.3 step 4, the valid lifetime of a temporary
@@ -1392,7 +1408,8 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla
 		return false
 	}
 
-	// Attempt to generate a new address that is not already assigned to the NIC.
+	// Attempt to generate a new address that is not already assigned to the IPv6
+	// endpoint.
 	var generatedAddr tcpip.AddressWithPrefix
 	for i := 0; ; i++ {
 		// If we were unable to generate an address after the maximum SLAAC address
@@ -1402,7 +1419,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla
 		}
 
 		generatedAddr = header.GenerateTempIPv6SLAACAddr(ndp.temporaryIIDHistory[:], stableAddr)
-		if !ndp.nic.hasPermanentAddrLocked(generatedAddr.Address) {
+		if !ndp.ep.hasPermanentAddressRLocked(generatedAddr.Address) {
 			break
 		}
 	}
@@ -1410,13 +1427,13 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla
 	// As per RFC RFC 4941 section 3.3 step 5, we MUST NOT create a temporary
 	// address with a zero preferred lifetime. The checks above ensure this
 	// so we know the address is not deprecated.
-	ref := ndp.addSLAACAddr(generatedAddr, slaacTemp, false /* deprecated */)
-	if ref == nil {
+	addressEndpoint := ndp.addAndAcquireSLAACAddr(generatedAddr, stack.AddressConfigSlaacTemp, false /* deprecated */)
+	if addressEndpoint == nil {
 		return false
 	}
 
 	state := tempSLAACAddrState{
-		deprecationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() {
+		deprecationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() {
 			prefixState, ok := ndp.slaacPrefixes[prefix]
 			if !ok {
 				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to deprecate temporary address %s", prefix, generatedAddr))
@@ -1427,9 +1444,9 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla
 				panic(fmt.Sprintf("ndp: must have a tempAddr entry to deprecate temporary address %s", generatedAddr))
 			}
 
-			ndp.deprecateSLAACAddress(tempAddrState.ref)
+			ndp.deprecateSLAACAddress(tempAddrState.addressEndpoint)
 		}),
-		invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() {
+		invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() {
 			prefixState, ok := ndp.slaacPrefixes[prefix]
 			if !ok {
 				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to invalidate temporary address %s", prefix, generatedAddr))
@@ -1442,7 +1459,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla
 
 			ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, generatedAddr.Address, tempAddrState)
 		}),
-		regenJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() {
+		regenJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() {
 			prefixState, ok := ndp.slaacPrefixes[prefix]
 			if !ok {
 				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to regenerate temporary address after %s", prefix, generatedAddr))
@@ -1465,8 +1482,8 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla
 			prefixState.tempAddrs[generatedAddr.Address] = tempAddrState
 			ndp.slaacPrefixes[prefix] = prefixState
 		}),
-		createdAt: now,
-		ref:       ref,
+		createdAt:       now,
+		addressEndpoint: addressEndpoint,
 	}
 
 	state.deprecationJob.Schedule(pl)
@@ -1481,7 +1498,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla
 
 // regenerateTempSLAACAddr regenerates a temporary address for a SLAAC prefix.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) regenerateTempSLAACAddr(prefix tcpip.Subnet, resetGenAttempts bool) {
 	state, ok := ndp.slaacPrefixes[prefix]
 	if !ok {
@@ -1496,14 +1513,14 @@ func (ndp *ndpState) regenerateTempSLAACAddr(prefix tcpip.Subnet, resetGenAttemp
 //
 // pl is the new preferred lifetime. vl is the new valid lifetime.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixState *slaacPrefixState, pl, vl time.Duration) {
 	// If the preferred lifetime is zero, then the prefix should be deprecated.
 	deprecated := pl == 0
 	if deprecated {
-		ndp.deprecateSLAACAddress(prefixState.stableAddr.ref)
+		ndp.deprecateSLAACAddress(prefixState.stableAddr.addressEndpoint)
 	} else {
-		prefixState.stableAddr.ref.deprecated = false
+		prefixState.stableAddr.addressEndpoint.SetDeprecated(false)
 	}
 
 	// If prefix was preferred for some finite lifetime before, cancel the
@@ -1565,7 +1582,7 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat
 
 	// If DAD is not yet complete on the stable address, there is no need to do
 	// work with temporary addresses.
-	if prefixState.stableAddr.ref.getKind() != permanent {
+	if prefixState.stableAddr.addressEndpoint.GetKind() != stack.Permanent {
 		return
 	}
 
@@ -1608,9 +1625,9 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat
 		newPreferredLifetime := preferredUntil.Sub(now)
 		tempAddrState.deprecationJob.Cancel()
 		if newPreferredLifetime <= 0 {
-			ndp.deprecateSLAACAddress(tempAddrState.ref)
+			ndp.deprecateSLAACAddress(tempAddrState.addressEndpoint)
 		} else {
-			tempAddrState.ref.deprecated = false
+			tempAddrState.addressEndpoint.SetDeprecated(false)
 			tempAddrState.deprecationJob.Schedule(newPreferredLifetime)
 		}
 
@@ -1635,8 +1652,8 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat
 	// due to an update in preferred lifetime.
 	//
 	// If each temporay address has already been regenerated, no new temporary
-	// address will be generated. To ensure continuation of temporary SLAAC
-	// addresses, we manually try to regenerate an address here.
+	// address is generated. To ensure continuation of temporary SLAAC addresses,
+	// we manually try to regenerate an address here.
 	if len(regenForAddr) != 0 || allAddressesRegenerated {
 		// Reset the generation attempts counter as we are starting the generation
 		// of a new address for the SLAAC prefix.
@@ -1647,57 +1664,58 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat
 	}
 }
 
-// deprecateSLAACAddress marks ref as deprecated and notifies the stack's NDP
-// dispatcher that ref has been deprecated.
+// deprecateSLAACAddress marks the address as deprecated and notifies the NDP
+// dispatcher that address has been deprecated.
 //
-// deprecateSLAACAddress does nothing if ref is already deprecated.
+// deprecateSLAACAddress does nothing if the address is already deprecated.
 //
-// The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) deprecateSLAACAddress(ref *referencedNetworkEndpoint) {
-	if ref.deprecated {
+// The IPv6 endpoint that ndp belongs to MUST be locked.
+func (ndp *ndpState) deprecateSLAACAddress(addressEndpoint stack.AddressEndpoint) {
+	if addressEndpoint.Deprecated() {
 		return
 	}
 
-	ref.deprecated = true
-	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
-		ndpDisp.OnAutoGenAddressDeprecated(ndp.nic.ID(), ref.addrWithPrefix())
+	addressEndpoint.SetDeprecated(true)
+	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnAutoGenAddressDeprecated(ndp.ep.nic.ID(), addressEndpoint.AddressWithPrefix())
 	}
 }
 
 // invalidateSLAACPrefix invalidates a SLAAC prefix.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) invalidateSLAACPrefix(prefix tcpip.Subnet, state slaacPrefixState) {
-	if r := state.stableAddr.ref; r != nil {
+	ndp.cleanupSLAACPrefixResources(prefix, state)
+
+	if addressEndpoint := state.stableAddr.addressEndpoint; addressEndpoint != nil {
 		// Since we are already invalidating the prefix, do not invalidate the
 		// prefix when removing the address.
-		if err := ndp.nic.removePermanentIPv6EndpointLocked(r, false /* allowSLAACInvalidation */); err != nil {
-			panic(fmt.Sprintf("ndp: error removing stable SLAAC address %s: %s", r.addrWithPrefix(), err))
+		if err := ndp.ep.removePermanentEndpointLocked(addressEndpoint, false /* allowSLAACInvalidation */); err != nil {
+			panic(fmt.Sprintf("ndp: error removing stable SLAAC address %s: %s", addressEndpoint.AddressWithPrefix(), err))
 		}
 	}
-
-	ndp.cleanupSLAACPrefixResources(prefix, state)
 }
 
 // cleanupSLAACAddrResourcesAndNotify cleans up an invalidated SLAAC address's
 // resources.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidatePrefix bool) {
-	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
-		ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), addr)
+	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), addr)
 	}
 
 	prefix := addr.Subnet()
 	state, ok := ndp.slaacPrefixes[prefix]
-	if !ok || state.stableAddr.ref == nil || addr.Address != state.stableAddr.ref.address() {
+	if !ok || state.stableAddr.addressEndpoint == nil || addr.Address != state.stableAddr.addressEndpoint.AddressWithPrefix().Address {
 		return
 	}
 
 	if !invalidatePrefix {
 		// If the prefix is not being invalidated, disassociate the address from the
 		// prefix and do nothing further.
-		state.stableAddr.ref = nil
+		state.stableAddr.addressEndpoint.DecRef()
+		state.stableAddr.addressEndpoint = nil
 		ndp.slaacPrefixes[prefix] = state
 		return
 	}
@@ -1709,14 +1727,17 @@ func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPr
 //
 // Panics if the SLAAC prefix is not known.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) cleanupSLAACPrefixResources(prefix tcpip.Subnet, state slaacPrefixState) {
 	// Invalidate all temporary addresses.
 	for tempAddr, tempAddrState := range state.tempAddrs {
 		ndp.invalidateTempSLAACAddr(state.tempAddrs, tempAddr, tempAddrState)
 	}
 
-	state.stableAddr.ref = nil
+	if state.stableAddr.addressEndpoint != nil {
+		state.stableAddr.addressEndpoint.DecRef()
+		state.stableAddr.addressEndpoint = nil
+	}
 	state.deprecationJob.Cancel()
 	state.invalidationJob.Cancel()
 	delete(ndp.slaacPrefixes, prefix)
@@ -1724,12 +1745,12 @@ func (ndp *ndpState) cleanupSLAACPrefixResources(prefix tcpip.Subnet, state slaa
 
 // invalidateTempSLAACAddr invalidates a temporary SLAAC address.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) invalidateTempSLAACAddr(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) {
 	// Since we are already invalidating the address, do not invalidate the
 	// address when removing the address.
-	if err := ndp.nic.removePermanentIPv6EndpointLocked(tempAddrState.ref, false /* allowSLAACInvalidation */); err != nil {
-		panic(fmt.Sprintf("error removing temporary SLAAC address %s: %s", tempAddrState.ref.addrWithPrefix(), err))
+	if err := ndp.ep.removePermanentEndpointLocked(tempAddrState.addressEndpoint, false /* allowSLAACInvalidation */); err != nil {
+		panic(fmt.Sprintf("error removing temporary SLAAC address %s: %s", tempAddrState.addressEndpoint.AddressWithPrefix(), err))
 	}
 
 	ndp.cleanupTempSLAACAddrResources(tempAddrs, tempAddr, tempAddrState)
@@ -1738,10 +1759,10 @@ func (ndp *ndpState) invalidateTempSLAACAddr(tempAddrs map[tcpip.Address]tempSLA
 // cleanupTempSLAACAddrResourcesAndNotify cleans up an invalidated temporary
 // SLAAC address's resources from ndp.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidateAddr bool) {
-	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
-		ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), addr)
+	if ndpDisp := ndp.ep.protocol.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), addr)
 	}
 
 	if !invalidateAddr {
@@ -1765,35 +1786,29 @@ func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotify(addr tcpip.AddressWi
 // cleanupTempSLAACAddrResourcesAndNotify cleans up a temporary SLAAC address's
 // jobs and entry.
 //
-// The NIC that ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) cleanupTempSLAACAddrResources(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) {
+	tempAddrState.addressEndpoint.DecRef()
+	tempAddrState.addressEndpoint = nil
 	tempAddrState.deprecationJob.Cancel()
 	tempAddrState.invalidationJob.Cancel()
 	tempAddrState.regenJob.Cancel()
 	delete(tempAddrs, tempAddr)
 }
 
-// cleanupState cleans up ndp's state.
-//
-// If hostOnly is true, then only host-specific state will be cleaned up.
+// removeSLAACAddresses removes all SLAAC addresses.
 //
-// cleanupState MUST be called with hostOnly set to true when ndp's NIC is
-// transitioning from a host to a router. This function will invalidate all
-// discovered on-link prefixes, discovered routers, and auto-generated
-// addresses.
-//
-// If hostOnly is true, then the link-local auto-generated address will not be
-// invalidated as routers are also expected to generate a link-local address.
+// If keepLinkLocal is false, the SLAAC generated link-local address is removed.
 //
-// The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) cleanupState(hostOnly bool) {
+// The IPv6 endpoint that ndp belongs to MUST be locked.
+func (ndp *ndpState) removeSLAACAddresses(keepLinkLocal bool) {
 	linkLocalSubnet := header.IPv6LinkLocalPrefix.Subnet()
-	linkLocalPrefixes := 0
+	var linkLocalPrefixes int
 	for prefix, state := range ndp.slaacPrefixes {
 		// RFC 4862 section 5 states that routers are also expected to generate a
 		// link-local address so we do not invalidate them if we are cleaning up
 		// host-only state.
-		if hostOnly && prefix == linkLocalSubnet {
+		if keepLinkLocal && prefix == linkLocalSubnet {
 			linkLocalPrefixes++
 			continue
 		}
@@ -1804,6 +1819,21 @@ func (ndp *ndpState) cleanupState(hostOnly bool) {
 	if got := len(ndp.slaacPrefixes); got != linkLocalPrefixes {
 		panic(fmt.Sprintf("ndp: still have non-linklocal SLAAC prefixes after cleaning up; found = %d prefixes, of which %d are link-local", got, linkLocalPrefixes))
 	}
+}
+
+// cleanupState cleans up ndp's state.
+//
+// If hostOnly is true, then only host-specific state is cleaned up.
+//
+// This function invalidates all discovered on-link prefixes, discovered
+// routers, and auto-generated addresses.
+//
+// If hostOnly is true, then the link-local auto-generated address aren't
+// invalidated as routers are also expected to generate a link-local address.
+//
+// The IPv6 endpoint that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupState(hostOnly bool) {
+	ndp.removeSLAACAddresses(hostOnly /* keepLinkLocal */)
 
 	for prefix := range ndp.onLinkPrefixes {
 		ndp.invalidateOnLinkPrefix(prefix)
@@ -1827,7 +1857,7 @@ func (ndp *ndpState) cleanupState(hostOnly bool) {
 // startSolicitingRouters starts soliciting routers, as per RFC 4861 section
 // 6.3.7. If routers are already being solicited, this function does nothing.
 //
-// The NIC ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) startSolicitingRouters() {
 	if ndp.rtrSolicit.timer != nil {
 		// We are already soliciting routers.
@@ -1848,27 +1878,37 @@ func (ndp *ndpState) startSolicitingRouters() {
 
 	var done bool
 	ndp.rtrSolicit.done = &done
-	ndp.rtrSolicit.timer = ndp.nic.stack.Clock().AfterFunc(delay, func() {
-		ndp.nic.mu.Lock()
+	ndp.rtrSolicit.timer = ndp.ep.protocol.stack.Clock().AfterFunc(delay, func() {
+		ndp.ep.mu.Lock()
 		if done {
 			// If we reach this point, it means that the RS timer fired after another
-			// goroutine already obtained the NIC lock and stopped solicitations.
-			// Simply return here and do nothing further.
-			ndp.nic.mu.Unlock()
+			// goroutine already obtained the IPv6 endpoint lock and stopped
+			// solicitations. Simply return here and do nothing further.
+			ndp.ep.mu.Unlock()
 			return
 		}
 
 		// As per RFC 4861 section 4.1, the source of the RS is an address assigned
 		// to the sending interface, or the unspecified address if no address is
 		// assigned to the sending interface.
-		ref := ndp.nic.primaryIPv6EndpointRLocked(header.IPv6AllRoutersMulticastAddress)
-		if ref == nil {
-			ref = ndp.nic.getRefOrCreateTempLocked(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint)
+		addressEndpoint := ndp.ep.acquireOutgoingPrimaryAddressRLocked(header.IPv6AllRoutersMulticastAddress, false)
+		if addressEndpoint == nil {
+			// Incase this ends up creating a new temporary address, we need to hold
+			// onto the endpoint until a route is obtained. If we decrement the
+			// reference count before obtaing a route, the address's resources would
+			// be released and attempting to obtain a route after would fail. Once a
+			// route is obtainted, it is safe to decrement the reference count since
+			// obtaining a route increments the address's reference count.
+			addressEndpoint = ndp.ep.acquireAddressOrCreateTempLocked(header.IPv6Any, true /* createTemp */, stack.NeverPrimaryEndpoint)
 		}
-		ndp.nic.mu.Unlock()
+		ndp.ep.mu.Unlock()
 
-		localAddr := ref.address()
-		r := makeRoute(header.IPv6ProtocolNumber, localAddr, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
+		localAddr := addressEndpoint.AddressWithPrefix().Address
+		r, err := ndp.ep.protocol.stack.FindRoute(ndp.ep.nic.ID(), localAddr, header.IPv6AllRoutersMulticastAddress, ProtocolNumber, false /* multicastLoop */)
+		addressEndpoint.DecRef()
+		if err != nil {
+			return
+		}
 		defer r.Release()
 
 		// Route should resolve immediately since
@@ -1876,15 +1916,16 @@ func (ndp *ndpState) startSolicitingRouters() {
 		// remote link address can be calculated without a resolution process.
 		if c, err := r.Resolve(nil); err != nil {
 			// Do not consider the NIC being unknown or disabled as a fatal error.
-			// Since this method is required to be called when the NIC is not locked,
-			// the NIC could have been disabled or removed by another goroutine.
+			// Since this method is required to be called when the IPv6 endpoint is
+			// not locked, the IPv6 endpoint could have been disabled or removed by
+			// another goroutine.
 			if err == tcpip.ErrUnknownNICID || err == tcpip.ErrInvalidEndpointState {
 				return
 			}
 
-			panic(fmt.Sprintf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID(), err))
+			panic(fmt.Sprintf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.ep.nic.ID(), err))
 		} else if c != nil {
-			panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID()))
+			panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.ep.nic.ID()))
 		}
 
 		// As per RFC 4861 section 4.1, an NDP RS SHOULD include the source
@@ -1907,21 +1948,20 @@ func (ndp *ndpState) startSolicitingRouters() {
 		rs.Options().Serialize(optsSerializer)
 		icmpData.SetChecksum(header.ICMPv6Checksum(icmpData, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
-		pkt := NewPacketBuffer(PacketBufferOptions{
+		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
 			ReserveHeaderBytes: int(r.MaxHeaderLength()),
 			Data:               buffer.View(icmpData).ToVectorisedView(),
 		})
 
 		sent := r.Stats().ICMP.V6PacketsSent
 		if err := r.WritePacket(nil,
-			NetworkHeaderParams{
+			stack.NetworkHeaderParams{
 				Protocol: header.ICMPv6ProtocolNumber,
 				TTL:      header.NDPHopLimit,
-				TOS:      DefaultTOS,
 			}, pkt,
 		); err != nil {
 			sent.Dropped.Increment()
-			log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.nic.ID(), err)
+			log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.ep.nic.ID(), err)
 			// Don't send any more messages if we had an error.
 			remaining = 0
 		} else {
@@ -1929,19 +1969,19 @@ func (ndp *ndpState) startSolicitingRouters() {
 			remaining--
 		}
 
-		ndp.nic.mu.Lock()
+		ndp.ep.mu.Lock()
 		if done || remaining == 0 {
 			ndp.rtrSolicit.timer = nil
 			ndp.rtrSolicit.done = nil
 		} else if ndp.rtrSolicit.timer != nil {
 			// Note, we need to explicitly check to make sure that
 			// the timer field is not nil because if it was nil but
-			// we still reached this point, then we know the NIC
+			// we still reached this point, then we know the IPv6 endpoint
 			// was requested to stop soliciting routers so we don't
 			// need to send the next Router Solicitation message.
 			ndp.rtrSolicit.timer.Reset(ndp.configs.RtrSolicitationInterval)
 		}
-		ndp.nic.mu.Unlock()
+		ndp.ep.mu.Unlock()
 	})
 
 }
@@ -1949,7 +1989,7 @@ func (ndp *ndpState) startSolicitingRouters() {
 // stopSolicitingRouters stops soliciting routers. If routers are not currently
 // being solicited, this function does nothing.
 //
-// The NIC ndp belongs to MUST be locked.
+// The IPv6 endpoint that ndp belongs to MUST be locked.
 func (ndp *ndpState) stopSolicitingRouters() {
 	if ndp.rtrSolicit.timer == nil {
 		// Nothing to do.
@@ -1965,7 +2005,7 @@ func (ndp *ndpState) stopSolicitingRouters() {
 // initializeTempAddrState initializes state related to temporary SLAAC
 // addresses.
 func (ndp *ndpState) initializeTempAddrState() {
-	header.InitialTempIID(ndp.temporaryIIDHistory[:], ndp.nic.stack.tempIIDSeed, ndp.nic.ID())
+	header.InitialTempIID(ndp.temporaryIIDHistory[:], ndp.ep.protocol.tempIIDSeed, ndp.ep.nic.ID())
 
 	if MaxDesyncFactor != 0 {
 		ndp.temporaryAddressDesyncFactor = time.Duration(rand.Int63n(int64(MaxDesyncFactor)))
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index af71a7d6b..ac20f217e 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -15,9 +15,12 @@
 package ipv6
 
 import (
+	"context"
 	"strings"
 	"testing"
+	"time"
 
+	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/checker"
@@ -30,12 +33,13 @@ import (
 // setupStackAndEndpoint creates a stack with a single NIC with a link-local
 // address llladdr and an IPv6 endpoint to a remote with link-local address
 // rlladdr
-func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address) (*stack.Stack, stack.NetworkEndpoint) {
+func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address, useNeighborCache bool) (*stack.Stack, stack.NetworkEndpoint) {
 	t.Helper()
 
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol6},
+		UseNeighborCache:   useNeighborCache,
 	})
 
 	if err := s.CreateNIC(1, &stubLinkEndpoint{}); err != nil {
@@ -63,11 +67,94 @@ func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address) (*stack
 		t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
 	}
 
-	ep := netProto.NewEndpoint(0, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s)
+	ep := netProto.NewEndpoint(&testInterface{}, &stubLinkAddressCache{}, &stubNUDHandler{}, &stubDispatcher{})
+	if err := ep.Enable(); err != nil {
+		t.Fatalf("ep.Enable(): %s", err)
+	}
+	t.Cleanup(ep.Close)
 
 	return s, ep
 }
 
+var _ NDPDispatcher = (*testNDPDispatcher)(nil)
+
+// testNDPDispatcher is an NDPDispatcher only allows default router discovery.
+type testNDPDispatcher struct {
+	addr tcpip.Address
+}
+
+func (*testNDPDispatcher) OnDuplicateAddressDetectionStatus(tcpip.NICID, tcpip.Address, bool, *tcpip.Error) {
+}
+
+func (t *testNDPDispatcher) OnDefaultRouterDiscovered(_ tcpip.NICID, addr tcpip.Address) bool {
+	t.addr = addr
+	return true
+}
+
+func (t *testNDPDispatcher) OnDefaultRouterInvalidated(_ tcpip.NICID, addr tcpip.Address) {
+	t.addr = addr
+}
+
+func (*testNDPDispatcher) OnOnLinkPrefixDiscovered(tcpip.NICID, tcpip.Subnet) bool {
+	return false
+}
+
+func (*testNDPDispatcher) OnOnLinkPrefixInvalidated(tcpip.NICID, tcpip.Subnet) {
+}
+
+func (*testNDPDispatcher) OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool {
+	return false
+}
+
+func (*testNDPDispatcher) OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix) {
+}
+
+func (*testNDPDispatcher) OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix) {
+}
+
+func (*testNDPDispatcher) OnRecursiveDNSServerOption(tcpip.NICID, []tcpip.Address, time.Duration) {
+}
+
+func (*testNDPDispatcher) OnDNSSearchListOption(tcpip.NICID, []string, time.Duration) {
+}
+
+func (*testNDPDispatcher) OnDHCPv6Configuration(tcpip.NICID, DHCPv6ConfigurationFromNDPRA) {
+}
+
+func TestStackNDPEndpointInvalidateDefaultRouter(t *testing.T) {
+	var ndpDisp testNDPDispatcher
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocolWithOptions(Options{
+			NDPDisp: &ndpDisp,
+		})},
+	})
+
+	if err := s.CreateNIC(nicID, &stubLinkEndpoint{}); err != nil {
+		t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err)
+	}
+
+	ep, err := s.GetNetworkEndpoint(nicID, ProtocolNumber)
+	if err != nil {
+		t.Fatalf("s.GetNetworkEndpoint(%d, %d): %s", nicID, ProtocolNumber, err)
+	}
+
+	ipv6EP := ep.(*endpoint)
+	ipv6EP.mu.Lock()
+	ipv6EP.mu.ndp.rememberDefaultRouter(lladdr1, time.Hour)
+	ipv6EP.mu.Unlock()
+
+	if ndpDisp.addr != lladdr1 {
+		t.Fatalf("got ndpDisp.addr = %s, want = %s", ndpDisp.addr, lladdr1)
+	}
+
+	ndpDisp.addr = ""
+	ndpEP := ep.(stack.NDPEndpoint)
+	ndpEP.InvalidateDefaultRouter(lladdr1)
+	if ndpDisp.addr != lladdr1 {
+		t.Fatalf("got ndpDisp.addr = %s, want = %s", ndpDisp.addr, lladdr1)
+	}
+}
+
 // TestNeighorSolicitationWithSourceLinkLayerOption tests that receiving a
 // valid NDP NS message with the Source Link Layer Address option results in a
 // new entry in the link address cache for the sender of the message.
@@ -97,7 +184,7 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
 			})
 			e := channel.New(0, 1280, linkAddr0)
 			if err := s.CreateNIC(nicID, e); err != nil {
@@ -171,6 +258,123 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
 	}
 }
 
+// TestNeighorSolicitationWithSourceLinkLayerOptionUsingNeighborCache tests
+// that receiving a valid NDP NS message with the Source Link Layer Address
+// option results in a new entry in the link address cache for the sender of
+// the message.
+func TestNeighorSolicitationWithSourceLinkLayerOptionUsingNeighborCache(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name             string
+		optsBuf          []byte
+		expectedLinkAddr tcpip.LinkAddress
+	}{
+		{
+			name:             "Valid",
+			optsBuf:          []byte{1, 1, 2, 3, 4, 5, 6, 7},
+			expectedLinkAddr: "\x02\x03\x04\x05\x06\x07",
+		},
+		{
+			name:    "Too Small",
+			optsBuf: []byte{1, 1, 2, 3, 4, 5, 6},
+		},
+		{
+			name:    "Invalid Length",
+			optsBuf: []byte{1, 2, 2, 3, 4, 5, 6, 7},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
+				UseNeighborCache: true,
+			})
+			e := channel.New(0, 1280, linkAddr0)
+			e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
+			}
+
+			ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + len(test.optsBuf)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
+			pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
+			pkt.SetType(header.ICMPv6NeighborSolicit)
+			ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+			ns.SetTargetAddress(lladdr0)
+			opts := ns.Options()
+			copy(opts, test.optsBuf)
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      255,
+				SrcAddr:       lladdr1,
+				DstAddr:       lladdr0,
+			})
+
+			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+
+			// Invalid count should initially be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+
+			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			neighbors, err := s.Neighbors(nicID)
+			if err != nil {
+				t.Fatalf("s.Neighbors(%d): %s", nicID, err)
+			}
+
+			neighborByAddr := make(map[tcpip.Address]stack.NeighborEntry)
+			for _, n := range neighbors {
+				if existing, ok := neighborByAddr[n.Addr]; ok {
+					if diff := cmp.Diff(existing, n); diff != "" {
+						t.Fatalf("s.Neighbors(%d) returned unexpected duplicate neighbor entry (-existing +got):\n%s", nicID, diff)
+					}
+					t.Fatalf("s.Neighbors(%d) returned unexpected duplicate neighbor entry: %s", nicID, existing)
+				}
+				neighborByAddr[n.Addr] = n
+			}
+
+			if neigh, ok := neighborByAddr[lladdr1]; len(test.expectedLinkAddr) != 0 {
+				// Invalid count should not have increased.
+				if got := invalid.Value(); got != 0 {
+					t.Errorf("got invalid = %d, want = 0", got)
+				}
+
+				if !ok {
+					t.Fatalf("expected a neighbor entry for %q", lladdr1)
+				}
+				if neigh.LinkAddr != test.expectedLinkAddr {
+					t.Errorf("got link address = %s, want = %s", neigh.LinkAddr, test.expectedLinkAddr)
+				}
+				if neigh.State != stack.Stale {
+					t.Errorf("got NUD state = %s, want = %s", neigh.State, stack.Stale)
+				}
+			} else {
+				// Invalid count should have increased.
+				if got := invalid.Value(); got != 1 {
+					t.Errorf("got invalid = %d, want = 1", got)
+				}
+
+				if ok {
+					t.Fatalf("unexpectedly got neighbor entry: %s", neigh)
+				}
+			}
+		})
+	}
+}
+
 func TestNeighorSolicitationResponse(t *testing.T) {
 	const nicID = 1
 	nicAddr := lladdr0
@@ -180,26 +384,41 @@ func TestNeighorSolicitationResponse(t *testing.T) {
 	remoteLinkAddr0 := linkAddr1
 	remoteLinkAddr1 := linkAddr2
 
+	stacks := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
+	}
+
 	tests := []struct {
-		name          string
-		nsOpts        header.NDPOptionsSerializer
-		nsSrcLinkAddr tcpip.LinkAddress
-		nsSrc         tcpip.Address
-		nsDst         tcpip.Address
-		nsInvalid     bool
-		naDstLinkAddr tcpip.LinkAddress
-		naSolicited   bool
-		naSrc         tcpip.Address
-		naDst         tcpip.Address
+		name                   string
+		nsOpts                 header.NDPOptionsSerializer
+		nsSrcLinkAddr          tcpip.LinkAddress
+		nsSrc                  tcpip.Address
+		nsDst                  tcpip.Address
+		nsInvalid              bool
+		naDstLinkAddr          tcpip.LinkAddress
+		naSolicited            bool
+		naSrc                  tcpip.Address
+		naDst                  tcpip.Address
+		performsLinkResolution bool
 	}{
 		{
-			name:          "Unspecified source to multicast destination",
+			name:          "Unspecified source to solicited-node multicast destination",
 			nsOpts:        nil,
 			nsSrcLinkAddr: remoteLinkAddr0,
 			nsSrc:         header.IPv6Any,
 			nsDst:         nicAddrSNMC,
 			nsInvalid:     false,
-			naDstLinkAddr: remoteLinkAddr0,
+			naDstLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllNodesMulticastAddress),
 			naSolicited:   false,
 			naSrc:         nicAddr,
 			naDst:         header.IPv6AllNodesMulticastAddress,
@@ -220,11 +439,7 @@ func TestNeighorSolicitationResponse(t *testing.T) {
 			nsSrcLinkAddr: remoteLinkAddr0,
 			nsSrc:         header.IPv6Any,
 			nsDst:         nicAddr,
-			nsInvalid:     false,
-			naDstLinkAddr: remoteLinkAddr0,
-			naSolicited:   false,
-			naSrc:         nicAddr,
-			naDst:         header.IPv6AllNodesMulticastAddress,
+			nsInvalid:     true,
 		},
 		{
 			name: "Unspecified source with source ll option to unicast destination",
@@ -236,7 +451,6 @@ func TestNeighorSolicitationResponse(t *testing.T) {
 			nsDst:         nicAddr,
 			nsInvalid:     true,
 		},
-
 		{
 			name: "Specified source with 1 source ll to multicast destination",
 			nsOpts: header.NDPOptionsSerializer{
@@ -296,6 +510,10 @@ func TestNeighorSolicitationResponse(t *testing.T) {
 			naSolicited:   true,
 			naSrc:         nicAddr,
 			naDst:         remoteAddr,
+			// Since we send a unicast solicitations to a node without an entry for
+			// the remote, the node needs to perform neighbor discovery to get the
+			// remote's link address to send the advertisement response.
+			performsLinkResolution: true,
 		},
 		{
 			name: "Specified source with 1 source ll to unicast destination",
@@ -338,86 +556,159 @@ func TestNeighorSolicitationResponse(t *testing.T) {
 		},
 	}
 
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
-			})
-			e := channel.New(1, 1280, nicLinkAddr)
-			if err := s.CreateNIC(nicID, e); err != nil {
-				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
-			}
-			if err := s.AddAddress(nicID, ProtocolNumber, nicAddr); err != nil {
-				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, nicAddr, err)
-			}
+	for _, stackTyp := range stacks {
+		t.Run(stackTyp.name, func(t *testing.T) {
+			for _, test := range tests {
+				t.Run(test.name, func(t *testing.T) {
+					s := stack.New(stack.Options{
+						NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
+						UseNeighborCache: stackTyp.useNeighborCache,
+					})
+					e := channel.New(1, 1280, nicLinkAddr)
+					e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+					if err := s.CreateNIC(nicID, e); err != nil {
+						t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+					}
+					if err := s.AddAddress(nicID, ProtocolNumber, nicAddr); err != nil {
+						t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, nicAddr, err)
+					}
 
-			ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + test.nsOpts.Length()
-			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
-			pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
-			pkt.SetType(header.ICMPv6NeighborSolicit)
-			ns := header.NDPNeighborSolicit(pkt.NDPPayload())
-			ns.SetTargetAddress(nicAddr)
-			opts := ns.Options()
-			opts.Serialize(test.nsOpts)
-			pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.nsSrc, test.nsDst, buffer.VectorisedView{}))
-			payloadLength := hdr.UsedLength()
-			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
-			ip.Encode(&header.IPv6Fields{
-				PayloadLength: uint16(payloadLength),
-				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
-				HopLimit:      255,
-				SrcAddr:       test.nsSrc,
-				DstAddr:       test.nsDst,
-			})
+					ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + test.nsOpts.Length()
+					hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
+					pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
+					pkt.SetType(header.ICMPv6NeighborSolicit)
+					ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+					ns.SetTargetAddress(nicAddr)
+					opts := ns.Options()
+					opts.Serialize(test.nsOpts)
+					pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.nsSrc, test.nsDst, buffer.VectorisedView{}))
+					payloadLength := hdr.UsedLength()
+					ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+					ip.Encode(&header.IPv6Fields{
+						PayloadLength: uint16(payloadLength),
+						NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+						HopLimit:      255,
+						SrcAddr:       test.nsSrc,
+						DstAddr:       test.nsDst,
+					})
+
+					invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
 
-			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+					// Invalid count should initially be 0.
+					if got := invalid.Value(); got != 0 {
+						t.Fatalf("got invalid = %d, want = 0", got)
+					}
 
-			// Invalid count should initially be 0.
-			if got := invalid.Value(); got != 0 {
-				t.Fatalf("got invalid = %d, want = 0", got)
-			}
+					e.InjectLinkAddr(ProtocolNumber, test.nsSrcLinkAddr, stack.NewPacketBuffer(stack.PacketBufferOptions{
+						Data: hdr.View().ToVectorisedView(),
+					}))
 
-			e.InjectLinkAddr(ProtocolNumber, test.nsSrcLinkAddr, stack.NewPacketBuffer(stack.PacketBufferOptions{
-				Data: hdr.View().ToVectorisedView(),
-			}))
+					if test.nsInvalid {
+						if got := invalid.Value(); got != 1 {
+							t.Fatalf("got invalid = %d, want = 1", got)
+						}
 
-			if test.nsInvalid {
-				if got := invalid.Value(); got != 1 {
-					t.Fatalf("got invalid = %d, want = 1", got)
-				}
+						if p, got := e.Read(); got {
+							t.Fatalf("unexpected response to an invalid NS = %+v", p.Pkt)
+						}
 
-				if p, got := e.Read(); got {
-					t.Fatalf("unexpected response to an invalid NS = %+v", p.Pkt)
-				}
+						// If we expected the NS to be invalid, we have nothing else to check.
+						return
+					}
 
-				// If we expected the NS to be invalid, we have nothing else to check.
-				return
-			}
+					if got := invalid.Value(); got != 0 {
+						t.Fatalf("got invalid = %d, want = 0", got)
+					}
 
-			if got := invalid.Value(); got != 0 {
-				t.Fatalf("got invalid = %d, want = 0", got)
-			}
+					if test.performsLinkResolution {
+						p, got := e.ReadContext(context.Background())
+						if !got {
+							t.Fatal("expected an NDP NS response")
+						}
+
+						if p.Route.LocalAddress != nicAddr {
+							t.Errorf("got p.Route.LocalAddress = %s, want = %s", p.Route.LocalAddress, nicAddr)
+						}
+						if p.Route.LocalLinkAddress != nicLinkAddr {
+							t.Errorf("p.Route.LocalLinkAddress = %s, want = %s", p.Route.LocalLinkAddress, nicLinkAddr)
+						}
+						respNSDst := header.SolicitedNodeAddr(test.nsSrc)
+						if p.Route.RemoteAddress != respNSDst {
+							t.Errorf("got p.Route.RemoteAddress = %s, want = %s", p.Route.RemoteAddress, respNSDst)
+						}
+						if want := header.EthernetAddressFromMulticastIPv6Address(respNSDst); p.Route.RemoteLinkAddress != want {
+							t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, want)
+						}
+
+						checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
+							checker.SrcAddr(nicAddr),
+							checker.DstAddr(respNSDst),
+							checker.TTL(header.NDPHopLimit),
+							checker.NDPNS(
+								checker.NDPNSTargetAddress(test.nsSrc),
+								checker.NDPNSOptions([]header.NDPOption{
+									header.NDPSourceLinkLayerAddressOption(nicLinkAddr),
+								}),
+							))
+
+						ser := header.NDPOptionsSerializer{
+							header.NDPTargetLinkLayerAddressOption(linkAddr1),
+						}
+						ndpNASize := header.ICMPv6NeighborAdvertMinimumSize + ser.Length()
+						hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize)
+						pkt := header.ICMPv6(hdr.Prepend(ndpNASize))
+						pkt.SetType(header.ICMPv6NeighborAdvert)
+						na := header.NDPNeighborAdvert(pkt.NDPPayload())
+						na.SetSolicitedFlag(true)
+						na.SetOverrideFlag(true)
+						na.SetTargetAddress(test.nsSrc)
+						na.Options().Serialize(ser)
+						pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.nsSrc, nicAddr, buffer.VectorisedView{}))
+						payloadLength := hdr.UsedLength()
+						ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+						ip.Encode(&header.IPv6Fields{
+							PayloadLength: uint16(payloadLength),
+							NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+							HopLimit:      header.NDPHopLimit,
+							SrcAddr:       test.nsSrc,
+							DstAddr:       nicAddr,
+						})
+						e.InjectLinkAddr(ProtocolNumber, "", stack.NewPacketBuffer(stack.PacketBufferOptions{
+							Data: hdr.View().ToVectorisedView(),
+						}))
+					}
 
-			p, got := e.Read()
-			if !got {
-				t.Fatal("expected an NDP NA response")
-			}
+					p, got := e.ReadContext(context.Background())
+					if !got {
+						t.Fatal("expected an NDP NA response")
+					}
 
-			if p.Route.RemoteLinkAddress != test.naDstLinkAddr {
-				t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr)
-			}
+					if p.Route.LocalAddress != test.naSrc {
+						t.Errorf("got p.Route.LocalAddress = %s, want = %s", p.Route.LocalAddress, test.naSrc)
+					}
+					if p.Route.LocalLinkAddress != nicLinkAddr {
+						t.Errorf("p.Route.LocalLinkAddress = %s, want = %s", p.Route.LocalLinkAddress, nicLinkAddr)
+					}
+					if p.Route.RemoteAddress != test.naDst {
+						t.Errorf("got p.Route.RemoteAddress = %s, want = %s", p.Route.RemoteAddress, test.naDst)
+					}
+					if p.Route.RemoteLinkAddress != test.naDstLinkAddr {
+						t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr)
+					}
 
-			checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
-				checker.SrcAddr(test.naSrc),
-				checker.DstAddr(test.naDst),
-				checker.TTL(header.NDPHopLimit),
-				checker.NDPNA(
-					checker.NDPNASolicitedFlag(test.naSolicited),
-					checker.NDPNATargetAddress(nicAddr),
-					checker.NDPNAOptions([]header.NDPOption{
-						header.NDPTargetLinkLayerAddressOption(nicLinkAddr[:]),
-					}),
-				))
+					checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
+						checker.SrcAddr(test.naSrc),
+						checker.DstAddr(test.naDst),
+						checker.TTL(header.NDPHopLimit),
+						checker.NDPNA(
+							checker.NDPNASolicitedFlag(test.naSolicited),
+							checker.NDPNATargetAddress(nicAddr),
+							checker.NDPNAOptions([]header.NDPOption{
+								header.NDPTargetLinkLayerAddressOption(nicLinkAddr[:]),
+							}),
+						))
+				})
+			}
 		})
 	}
 }
@@ -458,7 +749,7 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
 			})
 			e := channel.New(0, 1280, linkAddr0)
 			if err := s.CreateNIC(nicID, e); err != nil {
@@ -532,197 +823,380 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
 	}
 }
 
-func TestNDPValidation(t *testing.T) {
-	setup := func(t *testing.T) (*stack.Stack, stack.NetworkEndpoint, stack.Route) {
-		t.Helper()
-
-		// Create a stack with the assigned link-local address lladdr0
-		// and an endpoint to lladdr1.
-		s, ep := setupStackAndEndpoint(t, lladdr0, lladdr1)
-
-		r, err := s.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
-		if err != nil {
-			t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err)
-		}
-
-		return s, ep, r
-	}
-
-	handleIPv6Payload := func(payload buffer.View, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint, r *stack.Route) {
-		nextHdr := uint8(header.ICMPv6ProtocolNumber)
-		var extensions buffer.View
-		if atomicFragment {
-			extensions = buffer.NewView(header.IPv6FragmentExtHdrLength)
-			extensions[0] = nextHdr
-			nextHdr = uint8(header.IPv6FragmentExtHdrIdentifier)
-		}
-
-		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-			ReserveHeaderBytes: header.IPv6MinimumSize + len(extensions),
-			Data:               payload.ToVectorisedView(),
-		})
-		ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize + len(extensions)))
-		ip.Encode(&header.IPv6Fields{
-			PayloadLength: uint16(len(payload) + len(extensions)),
-			NextHeader:    nextHdr,
-			HopLimit:      hopLimit,
-			SrcAddr:       r.LocalAddress,
-			DstAddr:       r.RemoteAddress,
-		})
-		if n := copy(ip[header.IPv6MinimumSize:], extensions); n != len(extensions) {
-			t.Fatalf("expected to write %d bytes of extensions, but wrote %d", len(extensions), n)
-		}
-		ep.HandlePacket(r, pkt)
-	}
-
-	var tllData [header.NDPLinkLayerAddressSize]byte
-	header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
-		header.NDPTargetLinkLayerAddressOption(linkAddr1),
-	})
+// TestNeighorAdvertisementWithTargetLinkLayerOptionUsingNeighborCache tests
+// that receiving a valid NDP NA message with the Target Link Layer Address
+// option does not result in a new entry in the neighbor cache for the target
+// of the message.
+func TestNeighorAdvertisementWithTargetLinkLayerOptionUsingNeighborCache(t *testing.T) {
+	const nicID = 1
 
-	types := []struct {
-		name        string
-		typ         header.ICMPv6Type
-		size        int
-		extraData   []byte
-		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+	tests := []struct {
+		name    string
+		optsBuf []byte
+		isValid bool
 	}{
 		{
-			name: "RouterSolicit",
-			typ:  header.ICMPv6RouterSolicit,
-			size: header.ICMPv6MinimumSize,
-			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-				return stats.RouterSolicit
-			},
-		},
-		{
-			name: "RouterAdvert",
-			typ:  header.ICMPv6RouterAdvert,
-			size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
-			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-				return stats.RouterAdvert
-			},
+			name:    "Valid",
+			optsBuf: []byte{2, 1, 2, 3, 4, 5, 6, 7},
+			isValid: true,
 		},
 		{
-			name: "NeighborSolicit",
-			typ:  header.ICMPv6NeighborSolicit,
-			size: header.ICMPv6NeighborSolicitMinimumSize,
-			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-				return stats.NeighborSolicit
-			},
+			name:    "Too Small",
+			optsBuf: []byte{2, 1, 2, 3, 4, 5, 6},
 		},
 		{
-			name:      "NeighborAdvert",
-			typ:       header.ICMPv6NeighborAdvert,
-			size:      header.ICMPv6NeighborAdvertMinimumSize,
-			extraData: tllData[:],
-			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-				return stats.NeighborAdvert
-			},
+			name:    "Invalid Length",
+			optsBuf: []byte{2, 2, 2, 3, 4, 5, 6, 7},
 		},
 		{
-			name: "RedirectMsg",
-			typ:  header.ICMPv6RedirectMsg,
-			size: header.ICMPv6MinimumSize,
-			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
-				return stats.RedirectMsg
+			name: "Multiple",
+			optsBuf: []byte{
+				2, 1, 2, 3, 4, 5, 6, 7,
+				2, 1, 2, 3, 4, 5, 6, 8,
 			},
 		},
 	}
 
-	subTests := []struct {
-		name           string
-		atomicFragment bool
-		hopLimit       uint8
-		code           header.ICMPv6Code
-		valid          bool
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
+				UseNeighborCache: true,
+			})
+			e := channel.New(0, 1280, linkAddr0)
+			e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
+			}
+
+			ndpNASize := header.ICMPv6NeighborAdvertMinimumSize + len(test.optsBuf)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize)
+			pkt := header.ICMPv6(hdr.Prepend(ndpNASize))
+			pkt.SetType(header.ICMPv6NeighborAdvert)
+			ns := header.NDPNeighborAdvert(pkt.NDPPayload())
+			ns.SetTargetAddress(lladdr1)
+			opts := ns.Options()
+			copy(opts, test.optsBuf)
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      255,
+				SrcAddr:       lladdr1,
+				DstAddr:       lladdr0,
+			})
+
+			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+
+			// Invalid count should initially be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+
+			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			neighbors, err := s.Neighbors(nicID)
+			if err != nil {
+				t.Fatalf("s.Neighbors(%d): %s", nicID, err)
+			}
+
+			neighborByAddr := make(map[tcpip.Address]stack.NeighborEntry)
+			for _, n := range neighbors {
+				if existing, ok := neighborByAddr[n.Addr]; ok {
+					if diff := cmp.Diff(existing, n); diff != "" {
+						t.Fatalf("s.Neighbors(%d) returned unexpected duplicate neighbor entry (-existing +got):\n%s", nicID, diff)
+					}
+					t.Fatalf("s.Neighbors(%d) returned unexpected duplicate neighbor entry: %s", nicID, existing)
+				}
+				neighborByAddr[n.Addr] = n
+			}
+
+			if neigh, ok := neighborByAddr[lladdr1]; ok {
+				t.Fatalf("unexpectedly got neighbor entry: %s", neigh)
+			}
+
+			if test.isValid {
+				// Invalid count should not have increased.
+				if got := invalid.Value(); got != 0 {
+					t.Errorf("got invalid = %d, want = 0", got)
+				}
+			} else {
+				// Invalid count should have increased.
+				if got := invalid.Value(); got != 1 {
+					t.Errorf("got invalid = %d, want = 1", got)
+				}
+			}
+		})
+	}
+}
+
+func TestNDPValidation(t *testing.T) {
+	stacks := []struct {
+		name             string
+		useNeighborCache bool
 	}{
 		{
-			name:           "Valid",
-			atomicFragment: false,
-			hopLimit:       header.NDPHopLimit,
-			code:           0,
-			valid:          true,
+			name:             "linkAddrCache",
+			useNeighborCache: false,
 		},
 		{
-			name:           "Fragmented",
-			atomicFragment: true,
-			hopLimit:       header.NDPHopLimit,
-			code:           0,
-			valid:          false,
-		},
-		{
-			name:           "Invalid hop limit",
-			atomicFragment: false,
-			hopLimit:       header.NDPHopLimit - 1,
-			code:           0,
-			valid:          false,
-		},
-		{
-			name:           "Invalid ICMPv6 code",
-			atomicFragment: false,
-			hopLimit:       header.NDPHopLimit,
-			code:           1,
-			valid:          false,
+			name:             "neighborCache",
+			useNeighborCache: true,
 		},
 	}
 
-	for _, typ := range types {
-		t.Run(typ.name, func(t *testing.T) {
-			for _, test := range subTests {
-				t.Run(test.name, func(t *testing.T) {
-					s, ep, r := setup(t)
-					defer r.Release()
+	for _, stackTyp := range stacks {
+		t.Run(stackTyp.name, func(t *testing.T) {
+			setup := func(t *testing.T) (*stack.Stack, stack.NetworkEndpoint, stack.Route) {
+				t.Helper()
 
-					stats := s.Stats().ICMP.V6PacketsReceived
-					invalid := stats.Invalid
-					typStat := typ.statCounter(stats)
+				// Create a stack with the assigned link-local address lladdr0
+				// and an endpoint to lladdr1.
+				s, ep := setupStackAndEndpoint(t, lladdr0, lladdr1, stackTyp.useNeighborCache)
 
-					icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
-					copy(icmp[typ.size:], typ.extraData)
-					icmp.SetType(typ.typ)
-					icmp.SetCode(test.code)
-					icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView()))
+				r, err := s.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
+				if err != nil {
+					t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err)
+				}
 
-					// Rx count of the NDP message should initially be 0.
-					if got := typStat.Value(); got != 0 {
-						t.Errorf("got %s = %d, want = 0", typ.name, got)
-					}
+				return s, ep, r
+			}
 
-					// Invalid count should initially be 0.
-					if got := invalid.Value(); got != 0 {
-						t.Errorf("got invalid = %d, want = 0", got)
-					}
+			handleIPv6Payload := func(payload buffer.View, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint, r *stack.Route) {
+				nextHdr := uint8(header.ICMPv6ProtocolNumber)
+				var extensions buffer.View
+				if atomicFragment {
+					extensions = buffer.NewView(header.IPv6FragmentExtHdrLength)
+					extensions[0] = nextHdr
+					nextHdr = uint8(header.IPv6FragmentExtHdrIdentifier)
+				}
 
-					if t.Failed() {
-						t.FailNow()
-					}
+				pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+					ReserveHeaderBytes: header.IPv6MinimumSize + len(extensions),
+					Data:               payload.ToVectorisedView(),
+				})
+				ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize + len(extensions)))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(len(payload) + len(extensions)),
+					NextHeader:    nextHdr,
+					HopLimit:      hopLimit,
+					SrcAddr:       r.LocalAddress,
+					DstAddr:       r.RemoteAddress,
+				})
+				if n := copy(ip[header.IPv6MinimumSize:], extensions); n != len(extensions) {
+					t.Fatalf("expected to write %d bytes of extensions, but wrote %d", len(extensions), n)
+				}
+				ep.HandlePacket(r, pkt)
+			}
 
-					handleIPv6Payload(buffer.View(icmp), test.hopLimit, test.atomicFragment, ep, &r)
+			var tllData [header.NDPLinkLayerAddressSize]byte
+			header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
+				header.NDPTargetLinkLayerAddressOption(linkAddr1),
+			})
 
-					// Rx count of the NDP packet should have increased.
-					if got := typStat.Value(); got != 1 {
-						t.Errorf("got %s = %d, want = 1", typ.name, got)
-					}
+			var sllData [header.NDPLinkLayerAddressSize]byte
+			header.NDPOptions(sllData[:]).Serialize(header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(linkAddr1),
+			})
 
-					want := uint64(0)
-					if !test.valid {
-						// Invalid count should have increased.
-						want = 1
-					}
-					if got := invalid.Value(); got != want {
-						t.Errorf("got invalid = %d, want = %d", got, want)
+			types := []struct {
+				name        string
+				typ         header.ICMPv6Type
+				size        int
+				extraData   []byte
+				statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+				routerOnly  bool
+			}{
+				{
+					name: "RouterSolicit",
+					typ:  header.ICMPv6RouterSolicit,
+					size: header.ICMPv6MinimumSize,
+					statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+						return stats.RouterSolicit
+					},
+					routerOnly: true,
+				},
+				{
+					name: "RouterAdvert",
+					typ:  header.ICMPv6RouterAdvert,
+					size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
+					statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+						return stats.RouterAdvert
+					},
+				},
+				{
+					name:      "NeighborSolicit",
+					typ:       header.ICMPv6NeighborSolicit,
+					size:      header.ICMPv6NeighborSolicitMinimumSize,
+					extraData: sllData[:],
+					statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+						return stats.NeighborSolicit
+					},
+				},
+				{
+					name:      "NeighborAdvert",
+					typ:       header.ICMPv6NeighborAdvert,
+					size:      header.ICMPv6NeighborAdvertMinimumSize,
+					extraData: tllData[:],
+					statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+						return stats.NeighborAdvert
+					},
+				},
+				{
+					name: "RedirectMsg",
+					typ:  header.ICMPv6RedirectMsg,
+					size: header.ICMPv6MinimumSize,
+					statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+						return stats.RedirectMsg
+					},
+				},
+			}
+
+			subTests := []struct {
+				name           string
+				atomicFragment bool
+				hopLimit       uint8
+				code           header.ICMPv6Code
+				valid          bool
+			}{
+				{
+					name:           "Valid",
+					atomicFragment: false,
+					hopLimit:       header.NDPHopLimit,
+					code:           0,
+					valid:          true,
+				},
+				{
+					name:           "Fragmented",
+					atomicFragment: true,
+					hopLimit:       header.NDPHopLimit,
+					code:           0,
+					valid:          false,
+				},
+				{
+					name:           "Invalid hop limit",
+					atomicFragment: false,
+					hopLimit:       header.NDPHopLimit - 1,
+					code:           0,
+					valid:          false,
+				},
+				{
+					name:           "Invalid ICMPv6 code",
+					atomicFragment: false,
+					hopLimit:       header.NDPHopLimit,
+					code:           1,
+					valid:          false,
+				},
+			}
+
+			for _, typ := range types {
+				for _, isRouter := range []bool{false, true} {
+					name := typ.name
+					if isRouter {
+						name += " (Router)"
 					}
-				})
+
+					t.Run(name, func(t *testing.T) {
+						for _, test := range subTests {
+							t.Run(test.name, func(t *testing.T) {
+								s, ep, r := setup(t)
+								defer r.Release()
+
+								if isRouter {
+									// Enabling forwarding makes the stack act as a router.
+									s.SetForwarding(ProtocolNumber, true)
+								}
+
+								stats := s.Stats().ICMP.V6PacketsReceived
+								invalid := stats.Invalid
+								routerOnly := stats.RouterOnlyPacketsDroppedByHost
+								typStat := typ.statCounter(stats)
+
+								icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+								copy(icmp[typ.size:], typ.extraData)
+								icmp.SetType(typ.typ)
+								icmp.SetCode(test.code)
+								icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView()))
+
+								// Rx count of the NDP message should initially be 0.
+								if got := typStat.Value(); got != 0 {
+									t.Errorf("got %s = %d, want = 0", typ.name, got)
+								}
+
+								// Invalid count should initially be 0.
+								if got := invalid.Value(); got != 0 {
+									t.Errorf("got invalid = %d, want = 0", got)
+								}
+
+								// RouterOnlyPacketsReceivedByHost count should initially be 0.
+								if got := routerOnly.Value(); got != 0 {
+									t.Errorf("got RouterOnlyPacketsReceivedByHost = %d, want = 0", got)
+								}
+
+								if t.Failed() {
+									t.FailNow()
+								}
+
+								handleIPv6Payload(buffer.View(icmp), test.hopLimit, test.atomicFragment, ep, &r)
+
+								// Rx count of the NDP packet should have increased.
+								if got := typStat.Value(); got != 1 {
+									t.Errorf("got %s = %d, want = 1", typ.name, got)
+								}
+
+								want := uint64(0)
+								if !test.valid {
+									// Invalid count should have increased.
+									want = 1
+								}
+								if got := invalid.Value(); got != want {
+									t.Errorf("got invalid = %d, want = %d", got, want)
+								}
+
+								want = 0
+								if test.valid && !isRouter && typ.routerOnly {
+									// RouterOnlyPacketsReceivedByHost count should have increased.
+									want = 1
+								}
+								if got := routerOnly.Value(); got != want {
+									t.Errorf("got RouterOnlyPacketsReceivedByHost = %d, want = %d", got, want)
+								}
+
+							})
+						}
+					})
+				}
 			}
 		})
 	}
+
 }
 
 // TestRouterAdvertValidation tests that when the NIC is configured to handle
 // NDP Router Advertisement packets, it validates the Router Advertisement
 // properly before handling them.
 func TestRouterAdvertValidation(t *testing.T) {
+	stacks := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
+	}
+
 	tests := []struct {
 		name            string
 		src             tcpip.Address
@@ -844,61 +1318,67 @@ func TestRouterAdvertValidation(t *testing.T) {
 		},
 	}
 
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			e := channel.New(10, 1280, linkAddr1)
-			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
-			})
+	for _, stackTyp := range stacks {
+		t.Run(stackTyp.name, func(t *testing.T) {
+			for _, test := range tests {
+				t.Run(test.name, func(t *testing.T) {
+					e := channel.New(10, 1280, linkAddr1)
+					e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+					s := stack.New(stack.Options{
+						NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
+						UseNeighborCache: stackTyp.useNeighborCache,
+					})
+
+					if err := s.CreateNIC(1, e); err != nil {
+						t.Fatalf("CreateNIC(_) = %s", err)
+					}
 
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
-			}
+					icmpSize := header.ICMPv6HeaderSize + len(test.ndpPayload)
+					hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
+					pkt := header.ICMPv6(hdr.Prepend(icmpSize))
+					pkt.SetType(header.ICMPv6RouterAdvert)
+					pkt.SetCode(test.code)
+					copy(pkt.NDPPayload(), test.ndpPayload)
+					payloadLength := hdr.UsedLength()
+					pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.src, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
+					ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+					ip.Encode(&header.IPv6Fields{
+						PayloadLength: uint16(payloadLength),
+						NextHeader:    uint8(icmp.ProtocolNumber6),
+						HopLimit:      test.hopLimit,
+						SrcAddr:       test.src,
+						DstAddr:       header.IPv6AllNodesMulticastAddress,
+					})
 
-			icmpSize := header.ICMPv6HeaderSize + len(test.ndpPayload)
-			hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
-			pkt := header.ICMPv6(hdr.Prepend(icmpSize))
-			pkt.SetType(header.ICMPv6RouterAdvert)
-			pkt.SetCode(test.code)
-			copy(pkt.NDPPayload(), test.ndpPayload)
-			payloadLength := hdr.UsedLength()
-			pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.src, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
-			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
-			ip.Encode(&header.IPv6Fields{
-				PayloadLength: uint16(payloadLength),
-				NextHeader:    uint8(icmp.ProtocolNumber6),
-				HopLimit:      test.hopLimit,
-				SrcAddr:       test.src,
-				DstAddr:       header.IPv6AllNodesMulticastAddress,
-			})
-
-			stats := s.Stats().ICMP.V6PacketsReceived
-			invalid := stats.Invalid
-			rxRA := stats.RouterAdvert
+					stats := s.Stats().ICMP.V6PacketsReceived
+					invalid := stats.Invalid
+					rxRA := stats.RouterAdvert
 
-			if got := invalid.Value(); got != 0 {
-				t.Fatalf("got invalid = %d, want = 0", got)
-			}
-			if got := rxRA.Value(); got != 0 {
-				t.Fatalf("got rxRA = %d, want = 0", got)
-			}
+					if got := invalid.Value(); got != 0 {
+						t.Fatalf("got invalid = %d, want = 0", got)
+					}
+					if got := rxRA.Value(); got != 0 {
+						t.Fatalf("got rxRA = %d, want = 0", got)
+					}
 
-			e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
-				Data: hdr.View().ToVectorisedView(),
-			}))
+					e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+						Data: hdr.View().ToVectorisedView(),
+					}))
 
-			if got := rxRA.Value(); got != 1 {
-				t.Fatalf("got rxRA = %d, want = 1", got)
-			}
+					if got := rxRA.Value(); got != 1 {
+						t.Fatalf("got rxRA = %d, want = 1", got)
+					}
 
-			if test.expectedSuccess {
-				if got := invalid.Value(); got != 0 {
-					t.Fatalf("got invalid = %d, want = 0", got)
-				}
-			} else {
-				if got := invalid.Value(); got != 1 {
-					t.Fatalf("got invalid = %d, want = 1", got)
-				}
+					if test.expectedSuccess {
+						if got := invalid.Value(); got != 0 {
+							t.Fatalf("got invalid = %d, want = 0", got)
+						}
+					} else {
+						if got := invalid.Value(); got != 1 {
+							t.Fatalf("got invalid = %d, want = 1", got)
+						}
+					}
+				})
 			}
 		})
 	}
diff --git a/pkg/tcpip/network/testutil/BUILD b/pkg/tcpip/network/testutil/BUILD
new file mode 100644
index 000000000..d0ffc299a
--- /dev/null
+++ b/pkg/tcpip/network/testutil/BUILD
@@ -0,0 +1,21 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "testutil",
+    srcs = [
+        "testutil.go",
+    ],
+    visibility = [
+        "//pkg/tcpip/network/fragmentation:__pkg__",
+        "//pkg/tcpip/network/ipv4:__pkg__",
+        "//pkg/tcpip/network/ipv6:__pkg__",
+    ],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/network/testutil/testutil.go b/pkg/tcpip/network/testutil/testutil.go
new file mode 100644
index 000000000..7cc52985e
--- /dev/null
+++ b/pkg/tcpip/network/testutil/testutil.go
@@ -0,0 +1,144 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package testutil defines types and functions used to test Network Layer
+// functionality such as IP fragmentation.
+package testutil
+
+import (
+	"fmt"
+	"math/rand"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// MockLinkEndpoint is an endpoint used for testing, it stores packets written
+// to it and can mock errors.
+type MockLinkEndpoint struct {
+	// WrittenPackets is where packets written to the endpoint are stored.
+	WrittenPackets []*stack.PacketBuffer
+
+	mtu          uint32
+	err          *tcpip.Error
+	allowPackets int
+}
+
+// NewMockLinkEndpoint creates a new MockLinkEndpoint.
+//
+// err is the error that will be returned once allowPackets packets are written
+// to the endpoint.
+func NewMockLinkEndpoint(mtu uint32, err *tcpip.Error, allowPackets int) *MockLinkEndpoint {
+	return &MockLinkEndpoint{
+		mtu:          mtu,
+		err:          err,
+		allowPackets: allowPackets,
+	}
+}
+
+// MTU implements LinkEndpoint.MTU.
+func (ep *MockLinkEndpoint) MTU() uint32 { return ep.mtu }
+
+// Capabilities implements LinkEndpoint.Capabilities.
+func (*MockLinkEndpoint) Capabilities() stack.LinkEndpointCapabilities { return 0 }
+
+// MaxHeaderLength implements LinkEndpoint.MaxHeaderLength.
+func (*MockLinkEndpoint) MaxHeaderLength() uint16 { return 0 }
+
+// LinkAddress implements LinkEndpoint.LinkAddress.
+func (*MockLinkEndpoint) LinkAddress() tcpip.LinkAddress { return "" }
+
+// WritePacket implements LinkEndpoint.WritePacket.
+func (ep *MockLinkEndpoint) WritePacket(_ *stack.Route, _ *stack.GSO, _ tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	if ep.allowPackets == 0 {
+		return ep.err
+	}
+	ep.allowPackets--
+	ep.WrittenPackets = append(ep.WrittenPackets, pkt)
+	return nil
+}
+
+// WritePackets implements LinkEndpoint.WritePackets.
+func (ep *MockLinkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	var n int
+
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		if err := ep.WritePacket(r, gso, protocol, pkt); err != nil {
+			return n, err
+		}
+		n++
+	}
+
+	return n, nil
+}
+
+// WriteRawPacket implements LinkEndpoint.WriteRawPacket.
+func (ep *MockLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	if ep.allowPackets == 0 {
+		return ep.err
+	}
+	ep.allowPackets--
+
+	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+		Data: vv,
+	})
+	ep.WrittenPackets = append(ep.WrittenPackets, pkt)
+
+	return nil
+}
+
+// Attach implements LinkEndpoint.Attach.
+func (*MockLinkEndpoint) Attach(stack.NetworkDispatcher) {}
+
+// IsAttached implements LinkEndpoint.IsAttached.
+func (*MockLinkEndpoint) IsAttached() bool { return false }
+
+// Wait implements LinkEndpoint.Wait.
+func (*MockLinkEndpoint) Wait() {}
+
+// ARPHardwareType implements LinkEndpoint.ARPHardwareType.
+func (*MockLinkEndpoint) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareNone }
+
+// AddHeader implements LinkEndpoint.AddHeader.
+func (*MockLinkEndpoint) AddHeader(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, _ *stack.PacketBuffer) {
+}
+
+// MakeRandPkt generates a randomized packet. transportHeaderLength indicates
+// how many random bytes will be copied in the Transport Header.
+// extraHeaderReserveLength indicates how much extra space will be reserved for
+// the other headers. The payload is made from Views of the sizes listed in
+// viewSizes.
+func MakeRandPkt(transportHeaderLength int, extraHeaderReserveLength int, viewSizes []int, proto tcpip.NetworkProtocolNumber) *stack.PacketBuffer {
+	var views buffer.VectorisedView
+
+	for _, s := range viewSizes {
+		newView := buffer.NewView(s)
+		if _, err := rand.Read(newView); err != nil {
+			panic(fmt.Sprintf("rand.Read: %s", err))
+		}
+		views.AppendView(newView)
+	}
+
+	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+		ReserveHeaderBytes: transportHeaderLength + extraHeaderReserveLength,
+		Data:               views,
+	})
+	pkt.NetworkProtocolNumber = proto
+	if _, err := rand.Read(pkt.TransportHeader().Push(transportHeaderLength)); err != nil {
+		panic(fmt.Sprintf("rand.Read: %s", err))
+	}
+	return pkt
+}
diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go
index f6d592eb5..d87193650 100644
--- a/pkg/tcpip/ports/ports.go
+++ b/pkg/tcpip/ports/ports.go
@@ -400,7 +400,11 @@ func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumb
 // reserved by another endpoint. If port is zero, ReservePort will search for
 // an unreserved ephemeral port and reserve it, returning its value in the
 // "port" return value.
-func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress) (reservedPort uint16, err *tcpip.Error) {
+//
+// An optional testPort closure can be passed in which if provided will be used
+// to test if the picked port can be used. The function should return true if
+// the port is safe to use, false otherwise.
+func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress, testPort func(port uint16) bool) (reservedPort uint16, err *tcpip.Error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
@@ -412,12 +416,23 @@ func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transp
 		if !s.reserveSpecificPort(networks, transport, addr, port, flags, bindToDevice, dst) {
 			return 0, tcpip.ErrPortInUse
 		}
+		if testPort != nil && !testPort(port) {
+			s.releasePortLocked(networks, transport, addr, port, flags.Bits(), bindToDevice, dst)
+			return 0, tcpip.ErrPortInUse
+		}
 		return port, nil
 	}
 
 	// A port wasn't specified, so try to find one.
 	return s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) {
-		return s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice, dst), nil
+		if !s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice, dst) {
+			return false, nil
+		}
+		if testPort != nil && !testPort(p) {
+			s.releasePortLocked(networks, transport, addr, p, flags.Bits(), bindToDevice, dst)
+			return false, nil
+		}
+		return true, nil
 	})
 }
 
diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go
index 58db5868c..4bc949fd8 100644
--- a/pkg/tcpip/ports/ports_test.go
+++ b/pkg/tcpip/ports/ports_test.go
@@ -332,7 +332,7 @@ func TestPortReservation(t *testing.T) {
 					pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device, test.dest)
 					continue
 				}
-				gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device, test.dest)
+				gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device, test.dest, nil /* testPort */)
 				if err != test.want {
 					t.Fatalf("ReservePort(.., .., %s, %d, %+v, %d, %v) = %v, want %v", test.ip, test.port, test.flags, test.device, test.dest, err, test.want)
 				}
diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go
index 0ab089208..51d428049 100644
--- a/pkg/tcpip/sample/tun_tcp_connect/main.go
+++ b/pkg/tcpip/sample/tun_tcp_connect/main.go
@@ -127,8 +127,8 @@ func main() {
 	// Create the stack with ipv4 and tcp protocols, then add a tun-based
 	// NIC and ipv4 address.
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol},
 	})
 
 	mtu, err := rawfile.GetMTU(tunName)
@@ -182,7 +182,7 @@ func main() {
 	if terr == tcpip.ErrConnectStarted {
 		fmt.Println("Connect is pending...")
 		<-notifyCh
-		terr = ep.GetSockOpt(tcpip.ErrorOption{})
+		terr = ep.LastError()
 	}
 	wq.EventUnregister(&waitEntry)
 
diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go
index 9e37cab18..8e0ee1cd7 100644
--- a/pkg/tcpip/sample/tun_tcp_echo/main.go
+++ b/pkg/tcpip/sample/tun_tcp_echo/main.go
@@ -112,8 +112,8 @@ func main() {
 	// Create the stack with ip and tcp protocols, then add a tun-based
 	// NIC and address.
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol},
 	})
 
 	mtu, err := rawfile.GetMTU(tunName)
@@ -188,7 +188,7 @@ func main() {
 	defer wq.EventUnregister(&waitEntry)
 
 	for {
-		n, wq, err := ep.Accept()
+		n, wq, err := ep.Accept(nil)
 		if err != nil {
 			if err == tcpip.ErrWouldBlock {
 				<-notifyCh
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 900938dd1..d09ebe7fa 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -54,9 +54,8 @@ go_template_instance(
 go_library(
     name = "stack",
     srcs = [
+        "addressable_endpoint_state.go",
         "conntrack.go",
-        "dhcpv6configurationfromndpra_string.go",
-        "forwarder.go",
         "headertype_string.go",
         "icmp_rate_limit.go",
         "iptables.go",
@@ -65,7 +64,6 @@ go_library(
         "iptables_types.go",
         "linkaddrcache.go",
         "linkaddrentry_list.go",
-        "ndp.go",
         "neighbor_cache.go",
         "neighbor_entry.go",
         "neighbor_entry_list.go",
@@ -74,6 +72,7 @@ go_library(
         "nud.go",
         "packet_buffer.go",
         "packet_buffer_list.go",
+        "pending_packets.go",
         "rand.go",
         "registration.go",
         "route.go",
@@ -106,6 +105,7 @@ go_test(
     name = "stack_x_test",
     size = "medium",
     srcs = [
+        "addressable_endpoint_state_test.go",
         "ndp_test.go",
         "nud_test.go",
         "stack_test.go",
@@ -116,6 +116,7 @@ go_test(
     deps = [
         ":stack",
         "//pkg/rand",
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/checker",
@@ -138,8 +139,7 @@ go_test(
     name = "stack_test",
     size = "small",
     srcs = [
-        "fake_time_test.go",
-        "forwarder_test.go",
+        "forwarding_test.go",
         "linkaddrcache_test.go",
         "neighbor_cache_test.go",
         "neighbor_entry_test.go",
@@ -152,8 +152,8 @@ go_test(
         "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
+        "//pkg/tcpip/faketime",
         "//pkg/tcpip/header",
-        "@com_github_dpjacques_clockwork//:go_default_library",
         "@com_github_google_go_cmp//cmp:go_default_library",
         "@com_github_google_go_cmp//cmp/cmpopts:go_default_library",
     ],
diff --git a/pkg/tcpip/stack/addressable_endpoint_state.go b/pkg/tcpip/stack/addressable_endpoint_state.go
new file mode 100644
index 000000000..261705575
--- /dev/null
+++ b/pkg/tcpip/stack/addressable_endpoint_state.go
@@ -0,0 +1,755 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+var _ GroupAddressableEndpoint = (*AddressableEndpointState)(nil)
+var _ AddressableEndpoint = (*AddressableEndpointState)(nil)
+
+// AddressableEndpointState is an implementation of an AddressableEndpoint.
+type AddressableEndpointState struct {
+	networkEndpoint NetworkEndpoint
+
+	// Lock ordering (from outer to inner lock ordering):
+	//
+	// AddressableEndpointState.mu
+	//   addressState.mu
+	mu struct {
+		sync.RWMutex
+
+		endpoints map[tcpip.Address]*addressState
+		primary   []*addressState
+
+		// groups holds the mapping between group addresses and the number of times
+		// they have been joined.
+		groups map[tcpip.Address]uint32
+	}
+}
+
+// Init initializes the AddressableEndpointState with networkEndpoint.
+//
+// Must be called before calling any other function on m.
+func (a *AddressableEndpointState) Init(networkEndpoint NetworkEndpoint) {
+	a.networkEndpoint = networkEndpoint
+
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	a.mu.endpoints = make(map[tcpip.Address]*addressState)
+	a.mu.groups = make(map[tcpip.Address]uint32)
+}
+
+// ReadOnlyAddressableEndpointState provides read-only access to an
+// AddressableEndpointState.
+type ReadOnlyAddressableEndpointState struct {
+	inner *AddressableEndpointState
+}
+
+// AddrOrMatching returns an endpoint for the passed address that is consisdered
+// bound to the wrapped AddressableEndpointState.
+//
+// If addr is an exact match with an existing address, that address is returned.
+// Otherwise, f is called with each address and the address that f returns true
+// for is returned.
+//
+// Returns nil of no address matches.
+func (m ReadOnlyAddressableEndpointState) AddrOrMatching(addr tcpip.Address, spoofingOrPrimiscuous bool, f func(AddressEndpoint) bool) AddressEndpoint {
+	m.inner.mu.RLock()
+	defer m.inner.mu.RUnlock()
+
+	if ep, ok := m.inner.mu.endpoints[addr]; ok {
+		if ep.IsAssigned(spoofingOrPrimiscuous) && ep.IncRef() {
+			return ep
+		}
+	}
+
+	for _, ep := range m.inner.mu.endpoints {
+		if ep.IsAssigned(spoofingOrPrimiscuous) && f(ep) && ep.IncRef() {
+			return ep
+		}
+	}
+
+	return nil
+}
+
+// Lookup returns the AddressEndpoint for the passed address.
+//
+// Returns nil if the passed address is not associated with the
+// AddressableEndpointState.
+func (m ReadOnlyAddressableEndpointState) Lookup(addr tcpip.Address) AddressEndpoint {
+	m.inner.mu.RLock()
+	defer m.inner.mu.RUnlock()
+
+	ep, ok := m.inner.mu.endpoints[addr]
+	if !ok {
+		return nil
+	}
+	return ep
+}
+
+// ForEach calls f for each address pair.
+//
+// If f returns false, f is no longer be called.
+func (m ReadOnlyAddressableEndpointState) ForEach(f func(AddressEndpoint) bool) {
+	m.inner.mu.RLock()
+	defer m.inner.mu.RUnlock()
+
+	for _, ep := range m.inner.mu.endpoints {
+		if !f(ep) {
+			return
+		}
+	}
+}
+
+// ForEachPrimaryEndpoint calls f for each primary address.
+//
+// If f returns false, f is no longer be called.
+func (m ReadOnlyAddressableEndpointState) ForEachPrimaryEndpoint(f func(AddressEndpoint)) {
+	m.inner.mu.RLock()
+	defer m.inner.mu.RUnlock()
+	for _, ep := range m.inner.mu.primary {
+		f(ep)
+	}
+}
+
+// ReadOnly returns a readonly reference to a.
+func (a *AddressableEndpointState) ReadOnly() ReadOnlyAddressableEndpointState {
+	return ReadOnlyAddressableEndpointState{inner: a}
+}
+
+func (a *AddressableEndpointState) releaseAddressState(addrState *addressState) {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	a.releaseAddressStateLocked(addrState)
+}
+
+// releaseAddressState removes addrState from s's address state (primary and endpoints list).
+//
+// Preconditions: a.mu must be write locked.
+func (a *AddressableEndpointState) releaseAddressStateLocked(addrState *addressState) {
+	oldPrimary := a.mu.primary
+	for i, s := range a.mu.primary {
+		if s == addrState {
+			a.mu.primary = append(a.mu.primary[:i], a.mu.primary[i+1:]...)
+			oldPrimary[len(oldPrimary)-1] = nil
+			break
+		}
+	}
+	delete(a.mu.endpoints, addrState.addr.Address)
+}
+
+// AddAndAcquirePermanentAddress implements AddressableEndpoint.
+func (a *AddressableEndpointState) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior, configType AddressConfigType, deprecated bool) (AddressEndpoint, *tcpip.Error) {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	ep, err := a.addAndAcquireAddressLocked(addr, peb, configType, deprecated, true /* permanent */)
+	// From https://golang.org/doc/faq#nil_error:
+	//
+	// Under the covers, interfaces are implemented as two elements, a type T and
+	// a value V.
+	//
+	// An interface value is nil only if the V and T are both unset, (T=nil, V is
+	// not set), In particular, a nil interface will always hold a nil type. If we
+	// store a nil pointer of type *int inside an interface value, the inner type
+	// will be *int regardless of the value of the pointer: (T=*int, V=nil). Such
+	// an interface value will therefore be non-nil even when the pointer value V
+	// inside is nil.
+	//
+	// Since addAndAcquireAddressLocked returns a nil value with a non-nil type,
+	// we need to explicitly return nil below if ep is (a typed) nil.
+	if ep == nil {
+		return nil, err
+	}
+	return ep, err
+}
+
+// AddAndAcquireTemporaryAddress adds a temporary address.
+//
+// Returns tcpip.ErrDuplicateAddress if the address exists.
+//
+// The temporary address's endpoint is acquired and returned.
+func (a *AddressableEndpointState) AddAndAcquireTemporaryAddress(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior) (AddressEndpoint, *tcpip.Error) {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	ep, err := a.addAndAcquireAddressLocked(addr, peb, AddressConfigStatic, false /* deprecated */, false /* permanent */)
+	// From https://golang.org/doc/faq#nil_error:
+	//
+	// Under the covers, interfaces are implemented as two elements, a type T and
+	// a value V.
+	//
+	// An interface value is nil only if the V and T are both unset, (T=nil, V is
+	// not set), In particular, a nil interface will always hold a nil type. If we
+	// store a nil pointer of type *int inside an interface value, the inner type
+	// will be *int regardless of the value of the pointer: (T=*int, V=nil). Such
+	// an interface value will therefore be non-nil even when the pointer value V
+	// inside is nil.
+	//
+	// Since addAndAcquireAddressLocked returns a nil value with a non-nil type,
+	// we need to explicitly return nil below if ep is (a typed) nil.
+	if ep == nil {
+		return nil, err
+	}
+	return ep, err
+}
+
+// addAndAcquireAddressLocked adds, acquires and returns a permanent or
+// temporary address.
+//
+// If the addressable endpoint already has the address in a non-permanent state,
+// and addAndAcquireAddressLocked is adding a permanent address, that address is
+// promoted in place and its properties set to the properties provided. If the
+// address already exists in any other state, then tcpip.ErrDuplicateAddress is
+// returned, regardless the kind of address that is being added.
+//
+// Precondition: a.mu must be write locked.
+func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior, configType AddressConfigType, deprecated, permanent bool) (*addressState, *tcpip.Error) {
+	// attemptAddToPrimary is false when the address is already in the primary
+	// address list.
+	attemptAddToPrimary := true
+	addrState, ok := a.mu.endpoints[addr.Address]
+	if ok {
+		if !permanent {
+			// We are adding a non-permanent address but the address exists. No need
+			// to go any further since we can only promote existing temporary/expired
+			// addresses to permanent.
+			return nil, tcpip.ErrDuplicateAddress
+		}
+
+		addrState.mu.Lock()
+		if addrState.mu.kind.IsPermanent() {
+			addrState.mu.Unlock()
+			// We are adding a permanent address but a permanent address already
+			// exists.
+			return nil, tcpip.ErrDuplicateAddress
+		}
+
+		if addrState.mu.refs == 0 {
+			panic(fmt.Sprintf("found an address that should have been released (ref count == 0); address = %s", addrState.addr))
+		}
+
+		// We now promote the address.
+		for i, s := range a.mu.primary {
+			if s == addrState {
+				switch peb {
+				case CanBePrimaryEndpoint:
+					// The address is already in the primary address list.
+					attemptAddToPrimary = false
+				case FirstPrimaryEndpoint:
+					if i == 0 {
+						// The address is already first in the primary address list.
+						attemptAddToPrimary = false
+					} else {
+						a.mu.primary = append(a.mu.primary[:i], a.mu.primary[i+1:]...)
+					}
+				case NeverPrimaryEndpoint:
+					a.mu.primary = append(a.mu.primary[:i], a.mu.primary[i+1:]...)
+				default:
+					panic(fmt.Sprintf("unrecognized primary endpoint behaviour = %d", peb))
+				}
+				break
+			}
+		}
+	}
+
+	if addrState == nil {
+		addrState = &addressState{
+			addressableEndpointState: a,
+			addr:                     addr,
+		}
+		a.mu.endpoints[addr.Address] = addrState
+		addrState.mu.Lock()
+		// We never promote an address to temporary - it can only be added as such.
+		// If we are actaully adding a permanent address, it is promoted below.
+		addrState.mu.kind = Temporary
+	}
+
+	// At this point we have an address we are either promoting from an expired or
+	// temporary address to permanent, promoting an expired address to temporary,
+	// or we are adding a new temporary or permanent address.
+	//
+	// The address MUST be write locked at this point.
+	defer addrState.mu.Unlock()
+
+	if permanent {
+		if addrState.mu.kind.IsPermanent() {
+			panic(fmt.Sprintf("only non-permanent addresses should be promoted to permanent; address = %s", addrState.addr))
+		}
+
+		// Primary addresses are biased by 1.
+		addrState.mu.refs++
+		addrState.mu.kind = Permanent
+	}
+	// Acquire the address before returning it.
+	addrState.mu.refs++
+	addrState.mu.deprecated = deprecated
+	addrState.mu.configType = configType
+
+	if attemptAddToPrimary {
+		switch peb {
+		case NeverPrimaryEndpoint:
+		case CanBePrimaryEndpoint:
+			a.mu.primary = append(a.mu.primary, addrState)
+		case FirstPrimaryEndpoint:
+			if cap(a.mu.primary) == len(a.mu.primary) {
+				a.mu.primary = append([]*addressState{addrState}, a.mu.primary...)
+			} else {
+				// Shift all the endpoints by 1 to make room for the new address at the
+				// front. We could have just created a new slice but this saves
+				// allocations when the slice has capacity for the new address.
+				primaryCount := len(a.mu.primary)
+				a.mu.primary = append(a.mu.primary, nil)
+				if n := copy(a.mu.primary[1:], a.mu.primary); n != primaryCount {
+					panic(fmt.Sprintf("copied %d elements; expected = %d elements", n, primaryCount))
+				}
+				a.mu.primary[0] = addrState
+			}
+		default:
+			panic(fmt.Sprintf("unrecognized primary endpoint behaviour = %d", peb))
+		}
+	}
+
+	return addrState, nil
+}
+
+// RemovePermanentAddress implements AddressableEndpoint.
+func (a *AddressableEndpointState) RemovePermanentAddress(addr tcpip.Address) *tcpip.Error {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+
+	if _, ok := a.mu.groups[addr]; ok {
+		panic(fmt.Sprintf("group address = %s must be removed with LeaveGroup", addr))
+	}
+
+	return a.removePermanentAddressLocked(addr)
+}
+
+// removePermanentAddressLocked is like RemovePermanentAddress but with locking
+// requirements.
+//
+// Precondition: a.mu must be write locked.
+func (a *AddressableEndpointState) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
+	addrState, ok := a.mu.endpoints[addr]
+	if !ok {
+		return tcpip.ErrBadLocalAddress
+	}
+
+	return a.removePermanentEndpointLocked(addrState)
+}
+
+// RemovePermanentEndpoint removes the passed endpoint if it is associated with
+// a and permanent.
+func (a *AddressableEndpointState) RemovePermanentEndpoint(ep AddressEndpoint) *tcpip.Error {
+	addrState, ok := ep.(*addressState)
+	if !ok || addrState.addressableEndpointState != a {
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	return a.removePermanentEndpointLocked(addrState)
+}
+
+// removePermanentAddressLocked is like RemovePermanentAddress but with locking
+// requirements.
+//
+// Precondition: a.mu must be write locked.
+func (a *AddressableEndpointState) removePermanentEndpointLocked(addrState *addressState) *tcpip.Error {
+	if !addrState.GetKind().IsPermanent() {
+		return tcpip.ErrBadLocalAddress
+	}
+
+	addrState.SetKind(PermanentExpired)
+	a.decAddressRefLocked(addrState)
+	return nil
+}
+
+// decAddressRef decrements the address's reference count and releases it once
+// the reference count hits 0.
+func (a *AddressableEndpointState) decAddressRef(addrState *addressState) {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	a.decAddressRefLocked(addrState)
+}
+
+// decAddressRefLocked is like decAddressRef but with locking requirements.
+//
+// Precondition: a.mu must be write locked.
+func (a *AddressableEndpointState) decAddressRefLocked(addrState *addressState) {
+	addrState.mu.Lock()
+	defer addrState.mu.Unlock()
+
+	if addrState.mu.refs == 0 {
+		panic(fmt.Sprintf("attempted to decrease ref count for AddressEndpoint w/ addr = %s when it is already released", addrState.addr))
+	}
+
+	addrState.mu.refs--
+
+	if addrState.mu.refs != 0 {
+		return
+	}
+
+	// A non-expired permanent address must not have its reference count dropped
+	// to 0.
+	if addrState.mu.kind.IsPermanent() {
+		panic(fmt.Sprintf("permanent addresses should be removed through the AddressableEndpoint: addr = %s, kind = %d", addrState.addr, addrState.mu.kind))
+	}
+
+	a.releaseAddressStateLocked(addrState)
+}
+
+// MainAddress implements AddressableEndpoint.
+func (a *AddressableEndpointState) MainAddress() tcpip.AddressWithPrefix {
+	a.mu.RLock()
+	defer a.mu.RUnlock()
+
+	ep := a.acquirePrimaryAddressRLocked(func(ep *addressState) bool {
+		return ep.GetKind() == Permanent
+	})
+	if ep == nil {
+		return tcpip.AddressWithPrefix{}
+	}
+
+	addr := ep.AddressWithPrefix()
+	a.decAddressRefLocked(ep)
+	return addr
+}
+
+// acquirePrimaryAddressRLocked returns an acquired primary address that is
+// valid according to isValid.
+//
+// Precondition: e.mu must be read locked
+func (a *AddressableEndpointState) acquirePrimaryAddressRLocked(isValid func(*addressState) bool) *addressState {
+	var deprecatedEndpoint *addressState
+	for _, ep := range a.mu.primary {
+		if !isValid(ep) {
+			continue
+		}
+
+		if !ep.Deprecated() {
+			if ep.IncRef() {
+				// ep is not deprecated, so return it immediately.
+				//
+				// If we kept track of a deprecated endpoint, decrement its reference
+				// count since it was incremented when we decided to keep track of it.
+				if deprecatedEndpoint != nil {
+					a.decAddressRefLocked(deprecatedEndpoint)
+					deprecatedEndpoint = nil
+				}
+
+				return ep
+			}
+		} else if deprecatedEndpoint == nil && ep.IncRef() {
+			// We prefer an endpoint that is not deprecated, but we keep track of
+			// ep in case a doesn't have any non-deprecated endpoints.
+			//
+			// If we end up finding a more preferred endpoint, ep's reference count
+			// will be decremented.
+			deprecatedEndpoint = ep
+		}
+	}
+
+	return deprecatedEndpoint
+}
+
+// AcquireAssignedAddress implements AddressableEndpoint.
+func (a *AddressableEndpointState) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB PrimaryEndpointBehavior) AddressEndpoint {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+
+	if addrState, ok := a.mu.endpoints[localAddr]; ok {
+		if !addrState.IsAssigned(allowTemp) {
+			return nil
+		}
+
+		if !addrState.IncRef() {
+			panic(fmt.Sprintf("failed to increase the reference count for address = %s", addrState.addr))
+		}
+
+		return addrState
+	}
+
+	if !allowTemp {
+		return nil
+	}
+
+	addr := localAddr.WithPrefix()
+	ep, err := a.addAndAcquireAddressLocked(addr, tempPEB, AddressConfigStatic, false /* deprecated */, false /* permanent */)
+	if err != nil {
+		// addAndAcquireAddressLocked only returns an error if the address is
+		// already assigned but we just checked above if the address exists so we
+		// expect no error.
+		panic(fmt.Sprintf("a.addAndAcquireAddressLocked(%s, %d, %d, false, false): %s", addr, tempPEB, AddressConfigStatic, err))
+	}
+	// From https://golang.org/doc/faq#nil_error:
+	//
+	// Under the covers, interfaces are implemented as two elements, a type T and
+	// a value V.
+	//
+	// An interface value is nil only if the V and T are both unset, (T=nil, V is
+	// not set), In particular, a nil interface will always hold a nil type. If we
+	// store a nil pointer of type *int inside an interface value, the inner type
+	// will be *int regardless of the value of the pointer: (T=*int, V=nil). Such
+	// an interface value will therefore be non-nil even when the pointer value V
+	// inside is nil.
+	//
+	// Since addAndAcquireAddressLocked returns a nil value with a non-nil type,
+	// we need to explicitly return nil below if ep is (a typed) nil.
+	if ep == nil {
+		return nil
+	}
+	return ep
+}
+
+// AcquireOutgoingPrimaryAddress implements AddressableEndpoint.
+func (a *AddressableEndpointState) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) AddressEndpoint {
+	a.mu.RLock()
+	defer a.mu.RUnlock()
+
+	ep := a.acquirePrimaryAddressRLocked(func(ep *addressState) bool {
+		return ep.IsAssigned(allowExpired)
+	})
+
+	// From https://golang.org/doc/faq#nil_error:
+	//
+	// Under the covers, interfaces are implemented as two elements, a type T and
+	// a value V.
+	//
+	// An interface value is nil only if the V and T are both unset, (T=nil, V is
+	// not set), In particular, a nil interface will always hold a nil type. If we
+	// store a nil pointer of type *int inside an interface value, the inner type
+	// will be *int regardless of the value of the pointer: (T=*int, V=nil). Such
+	// an interface value will therefore be non-nil even when the pointer value V
+	// inside is nil.
+	//
+	// Since acquirePrimaryAddressRLocked returns a nil value with a non-nil type,
+	// we need to explicitly return nil below if ep is (a typed) nil.
+	if ep == nil {
+		return nil
+	}
+
+	return ep
+}
+
+// PrimaryAddresses implements AddressableEndpoint.
+func (a *AddressableEndpointState) PrimaryAddresses() []tcpip.AddressWithPrefix {
+	a.mu.RLock()
+	defer a.mu.RUnlock()
+
+	var addrs []tcpip.AddressWithPrefix
+	for _, ep := range a.mu.primary {
+		// Don't include tentative, expired or temporary endpoints
+		// to avoid confusion and prevent the caller from using
+		// those.
+		switch ep.GetKind() {
+		case PermanentTentative, PermanentExpired, Temporary:
+			continue
+		}
+
+		addrs = append(addrs, ep.AddressWithPrefix())
+	}
+
+	return addrs
+}
+
+// PermanentAddresses implements AddressableEndpoint.
+func (a *AddressableEndpointState) PermanentAddresses() []tcpip.AddressWithPrefix {
+	a.mu.RLock()
+	defer a.mu.RUnlock()
+
+	var addrs []tcpip.AddressWithPrefix
+	for _, ep := range a.mu.endpoints {
+		if !ep.GetKind().IsPermanent() {
+			continue
+		}
+
+		addrs = append(addrs, ep.AddressWithPrefix())
+	}
+
+	return addrs
+}
+
+// JoinGroup implements GroupAddressableEndpoint.
+func (a *AddressableEndpointState) JoinGroup(group tcpip.Address) (bool, *tcpip.Error) {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+
+	joins, ok := a.mu.groups[group]
+	if !ok {
+		ep, err := a.addAndAcquireAddressLocked(group.WithPrefix(), NeverPrimaryEndpoint, AddressConfigStatic, false /* deprecated */, true /* permanent */)
+		if err != nil {
+			return false, err
+		}
+		// We have no need for the address endpoint.
+		a.decAddressRefLocked(ep)
+	}
+
+	a.mu.groups[group] = joins + 1
+	return !ok, nil
+}
+
+// LeaveGroup implements GroupAddressableEndpoint.
+func (a *AddressableEndpointState) LeaveGroup(group tcpip.Address) (bool, *tcpip.Error) {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+
+	joins, ok := a.mu.groups[group]
+	if !ok {
+		return false, tcpip.ErrBadLocalAddress
+	}
+
+	if joins == 1 {
+		a.removeGroupAddressLocked(group)
+		delete(a.mu.groups, group)
+		return true, nil
+	}
+
+	a.mu.groups[group] = joins - 1
+	return false, nil
+}
+
+// IsInGroup implements GroupAddressableEndpoint.
+func (a *AddressableEndpointState) IsInGroup(group tcpip.Address) bool {
+	a.mu.RLock()
+	defer a.mu.RUnlock()
+	_, ok := a.mu.groups[group]
+	return ok
+}
+
+func (a *AddressableEndpointState) removeGroupAddressLocked(group tcpip.Address) {
+	if err := a.removePermanentAddressLocked(group); err != nil {
+		// removePermanentEndpointLocked would only return an error if group is
+		// not bound to the addressable endpoint, but we know it MUST be assigned
+		// since we have group in our map of groups.
+		panic(fmt.Sprintf("error removing group address = %s: %s", group, err))
+	}
+}
+
+// Cleanup forcefully leaves all groups and removes all permanent addresses.
+func (a *AddressableEndpointState) Cleanup() {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+
+	for group := range a.mu.groups {
+		a.removeGroupAddressLocked(group)
+	}
+	a.mu.groups = make(map[tcpip.Address]uint32)
+
+	for _, ep := range a.mu.endpoints {
+		// removePermanentEndpointLocked returns tcpip.ErrBadLocalAddress if ep is
+		// not a permanent address.
+		if err := a.removePermanentEndpointLocked(ep); err != nil && err != tcpip.ErrBadLocalAddress {
+			panic(fmt.Sprintf("unexpected error from removePermanentEndpointLocked(%s): %s", ep.addr, err))
+		}
+	}
+}
+
+var _ AddressEndpoint = (*addressState)(nil)
+
+// addressState holds state for an address.
+type addressState struct {
+	addressableEndpointState *AddressableEndpointState
+	addr                     tcpip.AddressWithPrefix
+
+	// Lock ordering (from outer to inner lock ordering):
+	//
+	// AddressableEndpointState.mu
+	//   addressState.mu
+	mu struct {
+		sync.RWMutex
+
+		refs       uint32
+		kind       AddressKind
+		configType AddressConfigType
+		deprecated bool
+	}
+}
+
+// AddressWithPrefix implements AddressEndpoint.
+func (a *addressState) AddressWithPrefix() tcpip.AddressWithPrefix {
+	return a.addr
+}
+
+// GetKind implements AddressEndpoint.
+func (a *addressState) GetKind() AddressKind {
+	a.mu.RLock()
+	defer a.mu.RUnlock()
+	return a.mu.kind
+}
+
+// SetKind implements AddressEndpoint.
+func (a *addressState) SetKind(kind AddressKind) {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	a.mu.kind = kind
+}
+
+// IsAssigned implements AddressEndpoint.
+func (a *addressState) IsAssigned(allowExpired bool) bool {
+	if !a.addressableEndpointState.networkEndpoint.Enabled() {
+		return false
+	}
+
+	switch a.GetKind() {
+	case PermanentTentative:
+		return false
+	case PermanentExpired:
+		return allowExpired
+	default:
+		return true
+	}
+}
+
+// IncRef implements AddressEndpoint.
+func (a *addressState) IncRef() bool {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	if a.mu.refs == 0 {
+		return false
+	}
+
+	a.mu.refs++
+	return true
+}
+
+// DecRef implements AddressEndpoint.
+func (a *addressState) DecRef() {
+	a.addressableEndpointState.decAddressRef(a)
+}
+
+// ConfigType implements AddressEndpoint.
+func (a *addressState) ConfigType() AddressConfigType {
+	a.mu.RLock()
+	defer a.mu.RUnlock()
+	return a.mu.configType
+}
+
+// SetDeprecated implements AddressEndpoint.
+func (a *addressState) SetDeprecated(d bool) {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	a.mu.deprecated = d
+}
+
+// Deprecated implements AddressEndpoint.
+func (a *addressState) Deprecated() bool {
+	a.mu.RLock()
+	defer a.mu.RUnlock()
+	return a.mu.deprecated
+}
diff --git a/pkg/tcpip/stack/addressable_endpoint_state_test.go b/pkg/tcpip/stack/addressable_endpoint_state_test.go
new file mode 100644
index 000000000..26787d0a3
--- /dev/null
+++ b/pkg/tcpip/stack/addressable_endpoint_state_test.go
@@ -0,0 +1,77 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack_test
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// TestAddressableEndpointStateCleanup tests that cleaning up an addressable
+// endpoint state removes permanent addresses and leaves groups.
+func TestAddressableEndpointStateCleanup(t *testing.T) {
+	var ep fakeNetworkEndpoint
+	if err := ep.Enable(); err != nil {
+		t.Fatalf("ep.Enable(): %s", err)
+	}
+
+	var s stack.AddressableEndpointState
+	s.Init(&ep)
+
+	addr := tcpip.AddressWithPrefix{
+		Address:   "\x01",
+		PrefixLen: 8,
+	}
+
+	{
+		ep, err := s.AddAndAcquirePermanentAddress(addr, stack.NeverPrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */)
+		if err != nil {
+			t.Fatalf("s.AddAndAcquirePermanentAddress(%s, %d, %d, false): %s", addr, stack.NeverPrimaryEndpoint, stack.AddressConfigStatic, err)
+		}
+		// We don't need the address endpoint.
+		ep.DecRef()
+	}
+	{
+		ep := s.AcquireAssignedAddress(addr.Address, false /* allowTemp */, stack.NeverPrimaryEndpoint)
+		if ep == nil {
+			t.Fatalf("got s.AcquireAssignedAddress(%s, false, NeverPrimaryEndpoint) = nil, want = non-nil", addr.Address)
+		}
+		ep.DecRef()
+	}
+
+	group := tcpip.Address("\x02")
+	if added, err := s.JoinGroup(group); err != nil {
+		t.Fatalf("s.JoinGroup(%s): %s", group, err)
+	} else if !added {
+		t.Fatalf("got s.JoinGroup(%s) = false, want = true", group)
+	}
+	if !s.IsInGroup(group) {
+		t.Fatalf("got s.IsInGroup(%s) = false, want = true", group)
+	}
+
+	s.Cleanup()
+	{
+		ep := s.AcquireAssignedAddress(addr.Address, false /* allowTemp */, stack.NeverPrimaryEndpoint)
+		if ep != nil {
+			ep.DecRef()
+			t.Fatalf("got s.AcquireAssignedAddress(%s, false, NeverPrimaryEndpoint) = %s, want = nil", addr.Address, ep.AddressWithPrefix())
+		}
+	}
+	if s.IsInGroup(group) {
+		t.Fatalf("got s.IsInGroup(%s) = true, want = false", group)
+	}
+}
diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go
index 7dd344b4f..0cd1da11f 100644
--- a/pkg/tcpip/stack/conntrack.go
+++ b/pkg/tcpip/stack/conntrack.go
@@ -196,13 +196,14 @@ type bucket struct {
 
 // packetToTupleID converts packet to a tuple ID. It fails when pkt lacks a valid
 // TCP header.
+//
+// Preconditions: pkt.NetworkHeader() is valid.
 func packetToTupleID(pkt *PacketBuffer) (tupleID, *tcpip.Error) {
-	// TODO(gvisor.dev/issue/170): Need to support for other
-	// protocols as well.
-	netHeader := header.IPv4(pkt.NetworkHeader().View())
-	if len(netHeader) < header.IPv4MinimumSize || netHeader.TransportProtocol() != header.TCPProtocolNumber {
+	netHeader := pkt.Network()
+	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
 		return tupleID{}, tcpip.ErrUnknownProtocol
 	}
+
 	tcpHeader := header.TCP(pkt.TransportHeader().View())
 	if len(tcpHeader) < header.TCPMinimumSize {
 		return tupleID{}, tcpip.ErrUnknownProtocol
@@ -214,7 +215,7 @@ func packetToTupleID(pkt *PacketBuffer) (tupleID, *tcpip.Error) {
 		dstAddr:    netHeader.DestinationAddress(),
 		dstPort:    tcpHeader.DestinationPort(),
 		transProto: netHeader.TransportProtocol(),
-		netProto:   header.IPv4ProtocolNumber,
+		netProto:   pkt.NetworkProtocolNumber,
 	}, nil
 }
 
@@ -268,7 +269,7 @@ func (ct *ConnTrack) connForTID(tid tupleID) (*conn, direction) {
 	return nil, dirOriginal
 }
 
-func (ct *ConnTrack) insertRedirectConn(pkt *PacketBuffer, hook Hook, rt RedirectTarget) *conn {
+func (ct *ConnTrack) insertRedirectConn(pkt *PacketBuffer, hook Hook, rt *RedirectTarget) *conn {
 	tid, err := packetToTupleID(pkt)
 	if err != nil {
 		return nil
@@ -281,8 +282,8 @@ func (ct *ConnTrack) insertRedirectConn(pkt *PacketBuffer, hook Hook, rt Redirec
 	// rule. This tuple will be used to manipulate the packet in
 	// handlePacket.
 	replyTID := tid.reply()
-	replyTID.srcAddr = rt.MinIP
-	replyTID.srcPort = rt.MinPort
+	replyTID.srcAddr = rt.Addr
+	replyTID.srcPort = rt.Port
 	var manip manipType
 	switch hook {
 	case Prerouting:
@@ -344,7 +345,7 @@ func handlePacketPrerouting(pkt *PacketBuffer, conn *conn, dir direction) {
 		return
 	}
 
-	netHeader := header.IPv4(pkt.NetworkHeader().View())
+	netHeader := pkt.Network()
 	tcpHeader := header.TCP(pkt.TransportHeader().View())
 
 	// For prerouting redirection, packets going in the original direction
@@ -366,8 +367,12 @@ func handlePacketPrerouting(pkt *PacketBuffer, conn *conn, dir direction) {
 	// support cases when they are validated, e.g. when we can't offload
 	// receive checksumming.
 
-	netHeader.SetChecksum(0)
-	netHeader.SetChecksum(^netHeader.CalculateChecksum())
+	// After modification, IPv4 packets need a valid checksum.
+	if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber {
+		netHeader := header.IPv4(pkt.NetworkHeader().View())
+		netHeader.SetChecksum(0)
+		netHeader.SetChecksum(^netHeader.CalculateChecksum())
+	}
 }
 
 // handlePacketOutput manipulates ports for packets in Output hook.
@@ -377,7 +382,7 @@ func handlePacketOutput(pkt *PacketBuffer, conn *conn, gso *GSO, r *Route, dir d
 		return
 	}
 
-	netHeader := header.IPv4(pkt.NetworkHeader().View())
+	netHeader := pkt.Network()
 	tcpHeader := header.TCP(pkt.TransportHeader().View())
 
 	// For output redirection, packets going in the original direction
@@ -396,7 +401,7 @@ func handlePacketOutput(pkt *PacketBuffer, conn *conn, gso *GSO, r *Route, dir d
 
 	// Calculate the TCP checksum and set it.
 	tcpHeader.SetChecksum(0)
-	length := uint16(pkt.Size()) - uint16(netHeader.HeaderLength())
+	length := uint16(pkt.Size()) - uint16(len(pkt.NetworkHeader().View()))
 	xsum := r.PseudoHeaderChecksum(header.TCPProtocolNumber, length)
 	if gso != nil && gso.NeedsCsum {
 		tcpHeader.SetChecksum(xsum)
@@ -405,8 +410,11 @@ func handlePacketOutput(pkt *PacketBuffer, conn *conn, gso *GSO, r *Route, dir d
 		tcpHeader.SetChecksum(^tcpHeader.CalculateChecksum(xsum))
 	}
 
-	netHeader.SetChecksum(0)
-	netHeader.SetChecksum(^netHeader.CalculateChecksum())
+	if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber {
+		netHeader := header.IPv4(pkt.NetworkHeader().View())
+		netHeader.SetChecksum(0)
+		netHeader.SetChecksum(^netHeader.CalculateChecksum())
+	}
 }
 
 // handlePacket will manipulate the port and address of the packet if the
@@ -422,7 +430,7 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Rou
 	}
 
 	// TODO(gvisor.dev/issue/170): Support other transport protocols.
-	if nh := pkt.NetworkHeader().View(); nh.IsEmpty() || header.IPv4(nh).TransportProtocol() != header.TCPProtocolNumber {
+	if pkt.Network().TransportProtocol() != header.TCPProtocolNumber {
 		return false
 	}
 
@@ -473,7 +481,7 @@ func (ct *ConnTrack) maybeInsertNoop(pkt *PacketBuffer, hook Hook) {
 	}
 
 	// We only track TCP connections.
-	if nh := pkt.NetworkHeader().View(); nh.IsEmpty() || header.IPv4(nh).TransportProtocol() != header.TCPProtocolNumber {
+	if pkt.Network().TransportProtocol() != header.TCPProtocolNumber {
 		return
 	}
 
@@ -572,7 +580,9 @@ func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, tim
 // reapTupleLocked tries to remove tuple and its reply from the table. It
 // returns whether the tuple's connection has timed out.
 //
-// Preconditions: ct.mu is locked for reading and bucket is locked.
+// Preconditions:
+// * ct.mu is locked for reading.
+// * bucket is locked.
 func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bool {
 	if !tuple.conn.timedOut(now) {
 		return false
@@ -607,7 +617,7 @@ func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bo
 	return true
 }
 
-func (ct *ConnTrack) originalDst(epID TransportEndpointID) (tcpip.Address, uint16, *tcpip.Error) {
+func (ct *ConnTrack) originalDst(epID TransportEndpointID, netProto tcpip.NetworkProtocolNumber) (tcpip.Address, uint16, *tcpip.Error) {
 	// Lookup the connection. The reply's original destination
 	// describes the original address.
 	tid := tupleID{
@@ -616,7 +626,7 @@ func (ct *ConnTrack) originalDst(epID TransportEndpointID) (tcpip.Address, uint1
 		dstAddr:    epID.RemoteAddress,
 		dstPort:    epID.RemotePort,
 		transProto: header.TCPProtocolNumber,
-		netProto:   header.IPv4ProtocolNumber,
+		netProto:   netProto,
 	}
 	conn, _ := ct.connForTID(tid)
 	if conn == nil {
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
deleted file mode 100644
index 5a684eb9d..000000000
--- a/pkg/tcpip/stack/forwarder_test.go
+++ /dev/null
@@ -1,650 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package stack
-
-import (
-	"encoding/binary"
-	"math"
-	"testing"
-	"time"
-
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-)
-
-const (
-	fwdTestNetNumber           tcpip.NetworkProtocolNumber = math.MaxUint32
-	fwdTestNetHeaderLen                                    = 12
-	fwdTestNetDefaultPrefixLen                             = 8
-
-	// fwdTestNetDefaultMTU is the MTU, in bytes, used throughout the tests,
-	// except where another value is explicitly used. It is chosen to match
-	// the MTU of loopback interfaces on linux systems.
-	fwdTestNetDefaultMTU = 65536
-
-	dstAddrOffset        = 0
-	srcAddrOffset        = 1
-	protocolNumberOffset = 2
-)
-
-// fwdTestNetworkEndpoint is a network-layer protocol endpoint.
-// Headers of this protocol are fwdTestNetHeaderLen bytes, but we currently only
-// use the first three: destination address, source address, and transport
-// protocol. They're all one byte fields to simplify parsing.
-type fwdTestNetworkEndpoint struct {
-	nicID      tcpip.NICID
-	proto      *fwdTestNetworkProtocol
-	dispatcher TransportDispatcher
-	ep         LinkEndpoint
-}
-
-func (f *fwdTestNetworkEndpoint) MTU() uint32 {
-	return f.ep.MTU() - uint32(f.MaxHeaderLength())
-}
-
-func (f *fwdTestNetworkEndpoint) NICID() tcpip.NICID {
-	return f.nicID
-}
-
-func (*fwdTestNetworkEndpoint) DefaultTTL() uint8 {
-	return 123
-}
-
-func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt *PacketBuffer) {
-	// Dispatch the packet to the transport protocol.
-	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(pkt.NetworkHeader().View()[protocolNumberOffset]), pkt)
-}
-
-func (f *fwdTestNetworkEndpoint) MaxHeaderLength() uint16 {
-	return f.ep.MaxHeaderLength() + fwdTestNetHeaderLen
-}
-
-func (f *fwdTestNetworkEndpoint) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, dstAddr tcpip.Address) uint16 {
-	return 0
-}
-
-func (f *fwdTestNetworkEndpoint) Capabilities() LinkEndpointCapabilities {
-	return f.ep.Capabilities()
-}
-
-func (f *fwdTestNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
-	return f.proto.Number()
-}
-
-func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error {
-	// Add the protocol's header to the packet and send it to the link
-	// endpoint.
-	b := pkt.NetworkHeader().Push(fwdTestNetHeaderLen)
-	b[dstAddrOffset] = r.RemoteAddress[0]
-	b[srcAddrOffset] = r.LocalAddress[0]
-	b[protocolNumberOffset] = byte(params.Protocol)
-
-	return f.ep.WritePacket(r, gso, fwdTestNetNumber, pkt)
-}
-
-// WritePackets implements LinkEndpoint.WritePackets.
-func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) {
-	panic("not implemented")
-}
-
-func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error {
-	return tcpip.ErrNotSupported
-}
-
-func (*fwdTestNetworkEndpoint) Close() {}
-
-// fwdTestNetworkProtocol is a network-layer protocol that implements Address
-// resolution.
-type fwdTestNetworkProtocol struct {
-	addrCache              *linkAddrCache
-	addrResolveDelay       time.Duration
-	onLinkAddressResolved  func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress)
-	onResolveStaticAddress func(tcpip.Address) (tcpip.LinkAddress, bool)
-}
-
-var _ LinkAddressResolver = (*fwdTestNetworkProtocol)(nil)
-
-func (f *fwdTestNetworkProtocol) Number() tcpip.NetworkProtocolNumber {
-	return fwdTestNetNumber
-}
-
-func (f *fwdTestNetworkProtocol) MinimumPacketSize() int {
-	return fwdTestNetHeaderLen
-}
-
-func (f *fwdTestNetworkProtocol) DefaultPrefixLen() int {
-	return fwdTestNetDefaultPrefixLen
-}
-
-func (*fwdTestNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
-	return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1])
-}
-
-func (*fwdTestNetworkProtocol) Parse(pkt *PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) {
-	netHeader, ok := pkt.NetworkHeader().Consume(fwdTestNetHeaderLen)
-	if !ok {
-		return 0, false, false
-	}
-	return tcpip.TransportProtocolNumber(netHeader[protocolNumberOffset]), true, true
-}
-
-func (f *fwdTestNetworkProtocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, ep LinkEndpoint, _ *Stack) NetworkEndpoint {
-	return &fwdTestNetworkEndpoint{
-		nicID:      nicID,
-		proto:      f,
-		dispatcher: dispatcher,
-		ep:         ep,
-	}
-}
-
-func (f *fwdTestNetworkProtocol) SetOption(option interface{}) *tcpip.Error {
-	return tcpip.ErrUnknownProtocolOption
-}
-
-func (f *fwdTestNetworkProtocol) Option(option interface{}) *tcpip.Error {
-	return tcpip.ErrUnknownProtocolOption
-}
-
-func (f *fwdTestNetworkProtocol) Close() {}
-
-func (f *fwdTestNetworkProtocol) Wait() {}
-
-func (f *fwdTestNetworkProtocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error {
-	if f.addrCache != nil && f.onLinkAddressResolved != nil {
-		time.AfterFunc(f.addrResolveDelay, func() {
-			f.onLinkAddressResolved(f.addrCache, addr, remoteLinkAddr)
-		})
-	}
-	return nil
-}
-
-func (f *fwdTestNetworkProtocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
-	if f.onResolveStaticAddress != nil {
-		return f.onResolveStaticAddress(addr)
-	}
-	return "", false
-}
-
-func (f *fwdTestNetworkProtocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
-	return fwdTestNetNumber
-}
-
-// fwdTestPacketInfo holds all the information about an outbound packet.
-type fwdTestPacketInfo struct {
-	RemoteLinkAddress tcpip.LinkAddress
-	LocalLinkAddress  tcpip.LinkAddress
-	Pkt               *PacketBuffer
-}
-
-type fwdTestLinkEndpoint struct {
-	dispatcher NetworkDispatcher
-	mtu        uint32
-	linkAddr   tcpip.LinkAddress
-
-	// C is where outbound packets are queued.
-	C chan fwdTestPacketInfo
-}
-
-// InjectInbound injects an inbound packet.
-func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
-	e.InjectLinkAddr(protocol, "", pkt)
-}
-
-// InjectLinkAddr injects an inbound packet with a remote link address.
-func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *PacketBuffer) {
-	e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt)
-}
-
-// Attach saves the stack network-layer dispatcher for use later when packets
-// are injected.
-func (e *fwdTestLinkEndpoint) Attach(dispatcher NetworkDispatcher) {
-	e.dispatcher = dispatcher
-}
-
-// IsAttached implements stack.LinkEndpoint.IsAttached.
-func (e *fwdTestLinkEndpoint) IsAttached() bool {
-	return e.dispatcher != nil
-}
-
-// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
-// during construction.
-func (e *fwdTestLinkEndpoint) MTU() uint32 {
-	return e.mtu
-}
-
-// Capabilities implements stack.LinkEndpoint.Capabilities.
-func (e fwdTestLinkEndpoint) Capabilities() LinkEndpointCapabilities {
-	caps := LinkEndpointCapabilities(0)
-	return caps | CapabilityResolutionRequired
-}
-
-// GSOMaxSize returns the maximum GSO packet size.
-func (*fwdTestLinkEndpoint) GSOMaxSize() uint32 {
-	return 1 << 15
-}
-
-// MaxHeaderLength returns the maximum size of the link layer header. Given it
-// doesn't have a header, it just returns 0.
-func (*fwdTestLinkEndpoint) MaxHeaderLength() uint16 {
-	return 0
-}
-
-// LinkAddress returns the link address of this endpoint.
-func (e *fwdTestLinkEndpoint) LinkAddress() tcpip.LinkAddress {
-	return e.linkAddr
-}
-
-func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error {
-	p := fwdTestPacketInfo{
-		RemoteLinkAddress: r.RemoteLinkAddress,
-		LocalLinkAddress:  r.LocalLinkAddress,
-		Pkt:               pkt,
-	}
-
-	select {
-	case e.C <- p:
-	default:
-	}
-
-	return nil
-}
-
-// WritePackets stores outbound packets into the channel.
-func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
-	n := 0
-	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
-		e.WritePacket(r, gso, protocol, pkt)
-		n++
-	}
-
-	return n, nil
-}
-
-// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
-func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	p := fwdTestPacketInfo{
-		Pkt: NewPacketBuffer(PacketBufferOptions{Data: vv}),
-	}
-
-	select {
-	case e.C <- p:
-	default:
-	}
-
-	return nil
-}
-
-// Wait implements stack.LinkEndpoint.Wait.
-func (*fwdTestLinkEndpoint) Wait() {}
-
-// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
-func (*fwdTestLinkEndpoint) ARPHardwareType() header.ARPHardwareType {
-	panic("not implemented")
-}
-
-// AddHeader implements stack.LinkEndpoint.AddHeader.
-func (e *fwdTestLinkEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
-	panic("not implemented")
-}
-
-func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol) (ep1, ep2 *fwdTestLinkEndpoint) {
-	// Create a stack with the network protocol and two NICs.
-	s := New(Options{
-		NetworkProtocols: []NetworkProtocol{proto},
-	})
-
-	proto.addrCache = s.linkAddrCache
-
-	// Enable forwarding.
-	s.SetForwarding(true)
-
-	// NIC 1 has the link address "a", and added the network address 1.
-	ep1 = &fwdTestLinkEndpoint{
-		C:        make(chan fwdTestPacketInfo, 300),
-		mtu:      fwdTestNetDefaultMTU,
-		linkAddr: "a",
-	}
-	if err := s.CreateNIC(1, ep1); err != nil {
-		t.Fatal("CreateNIC #1 failed:", err)
-	}
-	if err := s.AddAddress(1, fwdTestNetNumber, "\x01"); err != nil {
-		t.Fatal("AddAddress #1 failed:", err)
-	}
-
-	// NIC 2 has the link address "b", and added the network address 2.
-	ep2 = &fwdTestLinkEndpoint{
-		C:        make(chan fwdTestPacketInfo, 300),
-		mtu:      fwdTestNetDefaultMTU,
-		linkAddr: "b",
-	}
-	if err := s.CreateNIC(2, ep2); err != nil {
-		t.Fatal("CreateNIC #2 failed:", err)
-	}
-	if err := s.AddAddress(2, fwdTestNetNumber, "\x02"); err != nil {
-		t.Fatal("AddAddress #2 failed:", err)
-	}
-
-	// Route all packets to NIC 2.
-	{
-		subnet, err := tcpip.NewSubnet("\x00", "\x00")
-		if err != nil {
-			t.Fatal(err)
-		}
-		s.SetRouteTable([]tcpip.Route{{Destination: subnet, NIC: 2}})
-	}
-
-	return ep1, ep2
-}
-
-func TestForwardingWithStaticResolver(t *testing.T) {
-	// Create a network protocol with a static resolver.
-	proto := &fwdTestNetworkProtocol{
-		onResolveStaticAddress:
-		// The network address 3 is resolved to the link address "c".
-		func(addr tcpip.Address) (tcpip.LinkAddress, bool) {
-			if addr == "\x03" {
-				return "c", true
-			}
-			return "", false
-		},
-	}
-
-	ep1, ep2 := fwdTestNetFactory(t, proto)
-
-	// Inject an inbound packet to address 3 on NIC 1, and see if it is
-	// forwarded to NIC 2.
-	buf := buffer.NewView(30)
-	buf[dstAddrOffset] = 3
-	ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-		Data: buf.ToVectorisedView(),
-	}))
-
-	var p fwdTestPacketInfo
-
-	select {
-	case p = <-ep2.C:
-	default:
-		t.Fatal("packet not forwarded")
-	}
-
-	// Test that the static address resolution happened correctly.
-	if p.RemoteLinkAddress != "c" {
-		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
-	}
-	if p.LocalLinkAddress != "b" {
-		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
-	}
-}
-
-func TestForwardingWithFakeResolver(t *testing.T) {
-	// Create a network protocol with a fake resolver.
-	proto := &fwdTestNetworkProtocol{
-		addrResolveDelay: 500 * time.Millisecond,
-		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) {
-			// Any address will be resolved to the link address "c".
-			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
-		},
-	}
-
-	ep1, ep2 := fwdTestNetFactory(t, proto)
-
-	// Inject an inbound packet to address 3 on NIC 1, and see if it is
-	// forwarded to NIC 2.
-	buf := buffer.NewView(30)
-	buf[dstAddrOffset] = 3
-	ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-		Data: buf.ToVectorisedView(),
-	}))
-
-	var p fwdTestPacketInfo
-
-	select {
-	case p = <-ep2.C:
-	case <-time.After(time.Second):
-		t.Fatal("packet not forwarded")
-	}
-
-	// Test that the address resolution happened correctly.
-	if p.RemoteLinkAddress != "c" {
-		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
-	}
-	if p.LocalLinkAddress != "b" {
-		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
-	}
-}
-
-func TestForwardingWithNoResolver(t *testing.T) {
-	// Create a network protocol without a resolver.
-	proto := &fwdTestNetworkProtocol{}
-
-	ep1, ep2 := fwdTestNetFactory(t, proto)
-
-	// inject an inbound packet to address 3 on NIC 1, and see if it is
-	// forwarded to NIC 2.
-	buf := buffer.NewView(30)
-	buf[dstAddrOffset] = 3
-	ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-		Data: buf.ToVectorisedView(),
-	}))
-
-	select {
-	case <-ep2.C:
-		t.Fatal("Packet should not be forwarded")
-	case <-time.After(time.Second):
-	}
-}
-
-func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
-	// Create a network protocol with a fake resolver.
-	proto := &fwdTestNetworkProtocol{
-		addrResolveDelay: 500 * time.Millisecond,
-		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) {
-			// Only packets to address 3 will be resolved to the
-			// link address "c".
-			if addr == "\x03" {
-				cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
-			}
-		},
-	}
-
-	ep1, ep2 := fwdTestNetFactory(t, proto)
-
-	// Inject an inbound packet to address 4 on NIC 1. This packet should
-	// not be forwarded.
-	buf := buffer.NewView(30)
-	buf[dstAddrOffset] = 4
-	ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-		Data: buf.ToVectorisedView(),
-	}))
-
-	// Inject an inbound packet to address 3 on NIC 1, and see if it is
-	// forwarded to NIC 2.
-	buf = buffer.NewView(30)
-	buf[dstAddrOffset] = 3
-	ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-		Data: buf.ToVectorisedView(),
-	}))
-
-	var p fwdTestPacketInfo
-
-	select {
-	case p = <-ep2.C:
-	case <-time.After(time.Second):
-		t.Fatal("packet not forwarded")
-	}
-
-	if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 {
-		t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset])
-	}
-
-	// Test that the address resolution happened correctly.
-	if p.RemoteLinkAddress != "c" {
-		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
-	}
-	if p.LocalLinkAddress != "b" {
-		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
-	}
-}
-
-func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
-	// Create a network protocol with a fake resolver.
-	proto := &fwdTestNetworkProtocol{
-		addrResolveDelay: 500 * time.Millisecond,
-		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) {
-			// Any packets will be resolved to the link address "c".
-			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
-		},
-	}
-
-	ep1, ep2 := fwdTestNetFactory(t, proto)
-
-	// Inject two inbound packets to address 3 on NIC 1.
-	for i := 0; i < 2; i++ {
-		buf := buffer.NewView(30)
-		buf[dstAddrOffset] = 3
-		ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-			Data: buf.ToVectorisedView(),
-		}))
-	}
-
-	for i := 0; i < 2; i++ {
-		var p fwdTestPacketInfo
-
-		select {
-		case p = <-ep2.C:
-		case <-time.After(time.Second):
-			t.Fatal("packet not forwarded")
-		}
-
-		if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 {
-			t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset])
-		}
-
-		// Test that the address resolution happened correctly.
-		if p.RemoteLinkAddress != "c" {
-			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
-		}
-		if p.LocalLinkAddress != "b" {
-			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
-		}
-	}
-}
-
-func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
-	// Create a network protocol with a fake resolver.
-	proto := &fwdTestNetworkProtocol{
-		addrResolveDelay: 500 * time.Millisecond,
-		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) {
-			// Any packets will be resolved to the link address "c".
-			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
-		},
-	}
-
-	ep1, ep2 := fwdTestNetFactory(t, proto)
-
-	for i := 0; i < maxPendingPacketsPerResolution+5; i++ {
-		// Inject inbound 'maxPendingPacketsPerResolution + 5' packets on NIC 1.
-		buf := buffer.NewView(30)
-		buf[dstAddrOffset] = 3
-		// Set the packet sequence number.
-		binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], uint16(i))
-		ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-			Data: buf.ToVectorisedView(),
-		}))
-	}
-
-	for i := 0; i < maxPendingPacketsPerResolution; i++ {
-		var p fwdTestPacketInfo
-
-		select {
-		case p = <-ep2.C:
-		case <-time.After(time.Second):
-			t.Fatal("packet not forwarded")
-		}
-
-		b := PayloadSince(p.Pkt.NetworkHeader())
-		if b[dstAddrOffset] != 3 {
-			t.Fatalf("got b[dstAddrOffset] = %d, want = 3", b[dstAddrOffset])
-		}
-		if len(b) < fwdTestNetHeaderLen+2 {
-			t.Fatalf("packet is too short to hold a sequence number: len(b) = %d", b)
-		}
-		seqNumBuf := b[fwdTestNetHeaderLen:]
-
-		// The first 5 packets should not be forwarded so the sequence number should
-		// start with 5.
-		want := uint16(i + 5)
-		if n := binary.BigEndian.Uint16(seqNumBuf); n != want {
-			t.Fatalf("got the packet #%d, want = #%d", n, want)
-		}
-
-		// Test that the address resolution happened correctly.
-		if p.RemoteLinkAddress != "c" {
-			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
-		}
-		if p.LocalLinkAddress != "b" {
-			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
-		}
-	}
-}
-
-func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
-	// Create a network protocol with a fake resolver.
-	proto := &fwdTestNetworkProtocol{
-		addrResolveDelay: 500 * time.Millisecond,
-		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) {
-			// Any packets will be resolved to the link address "c".
-			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
-		},
-	}
-
-	ep1, ep2 := fwdTestNetFactory(t, proto)
-
-	for i := 0; i < maxPendingResolutions+5; i++ {
-		// Inject inbound 'maxPendingResolutions + 5' packets on NIC 1.
-		// Each packet has a different destination address (3 to
-		// maxPendingResolutions + 7).
-		buf := buffer.NewView(30)
-		buf[dstAddrOffset] = byte(3 + i)
-		ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
-			Data: buf.ToVectorisedView(),
-		}))
-	}
-
-	for i := 0; i < maxPendingResolutions; i++ {
-		var p fwdTestPacketInfo
-
-		select {
-		case p = <-ep2.C:
-		case <-time.After(time.Second):
-			t.Fatal("packet not forwarded")
-		}
-
-		// The first 5 packets (address 3 to 7) should not be forwarded
-		// because their address resolutions are interrupted.
-		if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] < 8 {
-			t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want p.Pkt.NetworkHeader[dstAddrOffset] >= 8", nh[dstAddrOffset])
-		}
-
-		// Test that the address resolution happened correctly.
-		if p.RemoteLinkAddress != "c" {
-			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
-		}
-		if p.LocalLinkAddress != "b" {
-			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
-		}
-	}
-}
diff --git a/pkg/tcpip/stack/forwarding_test.go b/pkg/tcpip/stack/forwarding_test.go
new file mode 100644
index 000000000..380688038
--- /dev/null
+++ b/pkg/tcpip/stack/forwarding_test.go
@@ -0,0 +1,876 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"encoding/binary"
+	"math"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+const (
+	fwdTestNetNumber           tcpip.NetworkProtocolNumber = math.MaxUint32
+	fwdTestNetHeaderLen                                    = 12
+	fwdTestNetDefaultPrefixLen                             = 8
+
+	// fwdTestNetDefaultMTU is the MTU, in bytes, used throughout the tests,
+	// except where another value is explicitly used. It is chosen to match
+	// the MTU of loopback interfaces on linux systems.
+	fwdTestNetDefaultMTU = 65536
+
+	dstAddrOffset        = 0
+	srcAddrOffset        = 1
+	protocolNumberOffset = 2
+)
+
+// fwdTestNetworkEndpoint is a network-layer protocol endpoint.
+// Headers of this protocol are fwdTestNetHeaderLen bytes, but we currently only
+// use the first three: destination address, source address, and transport
+// protocol. They're all one byte fields to simplify parsing.
+type fwdTestNetworkEndpoint struct {
+	AddressableEndpointState
+
+	nic        NetworkInterface
+	proto      *fwdTestNetworkProtocol
+	dispatcher TransportDispatcher
+}
+
+var _ NetworkEndpoint = (*fwdTestNetworkEndpoint)(nil)
+
+func (*fwdTestNetworkEndpoint) Enable() *tcpip.Error {
+	return nil
+}
+
+func (*fwdTestNetworkEndpoint) Enabled() bool {
+	return true
+}
+
+func (*fwdTestNetworkEndpoint) Disable() {}
+
+func (f *fwdTestNetworkEndpoint) MTU() uint32 {
+	return f.nic.MTU() - uint32(f.MaxHeaderLength())
+}
+
+func (*fwdTestNetworkEndpoint) DefaultTTL() uint8 {
+	return 123
+}
+
+func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt *PacketBuffer) {
+	// Dispatch the packet to the transport protocol.
+	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(pkt.NetworkHeader().View()[protocolNumberOffset]), pkt)
+}
+
+func (f *fwdTestNetworkEndpoint) MaxHeaderLength() uint16 {
+	return f.nic.MaxHeaderLength() + fwdTestNetHeaderLen
+}
+
+func (f *fwdTestNetworkEndpoint) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, dstAddr tcpip.Address) uint16 {
+	return 0
+}
+
+func (f *fwdTestNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return f.proto.Number()
+}
+
+func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error {
+	// Add the protocol's header to the packet and send it to the link
+	// endpoint.
+	b := pkt.NetworkHeader().Push(fwdTestNetHeaderLen)
+	b[dstAddrOffset] = r.RemoteAddress[0]
+	b[srcAddrOffset] = r.LocalAddress[0]
+	b[protocolNumberOffset] = byte(params.Protocol)
+
+	return f.nic.WritePacket(r, gso, fwdTestNetNumber, pkt)
+}
+
+// WritePackets implements LinkEndpoint.WritePackets.
+func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) {
+	panic("not implemented")
+}
+
+func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+func (f *fwdTestNetworkEndpoint) Close() {
+	f.AddressableEndpointState.Cleanup()
+}
+
+// fwdTestNetworkProtocol is a network-layer protocol that implements Address
+// resolution.
+type fwdTestNetworkProtocol struct {
+	addrCache              *linkAddrCache
+	neigh                  *neighborCache
+	addrResolveDelay       time.Duration
+	onLinkAddressResolved  func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress)
+	onResolveStaticAddress func(tcpip.Address) (tcpip.LinkAddress, bool)
+
+	mu struct {
+		sync.RWMutex
+		forwarding bool
+	}
+}
+
+var _ NetworkProtocol = (*fwdTestNetworkProtocol)(nil)
+var _ LinkAddressResolver = (*fwdTestNetworkProtocol)(nil)
+
+func (f *fwdTestNetworkProtocol) Number() tcpip.NetworkProtocolNumber {
+	return fwdTestNetNumber
+}
+
+func (f *fwdTestNetworkProtocol) MinimumPacketSize() int {
+	return fwdTestNetHeaderLen
+}
+
+func (f *fwdTestNetworkProtocol) DefaultPrefixLen() int {
+	return fwdTestNetDefaultPrefixLen
+}
+
+func (*fwdTestNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
+	return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1])
+}
+
+func (*fwdTestNetworkProtocol) Parse(pkt *PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) {
+	netHeader, ok := pkt.NetworkHeader().Consume(fwdTestNetHeaderLen)
+	if !ok {
+		return 0, false, false
+	}
+	return tcpip.TransportProtocolNumber(netHeader[protocolNumberOffset]), true, true
+}
+
+func (f *fwdTestNetworkProtocol) NewEndpoint(nic NetworkInterface, _ LinkAddressCache, _ NUDHandler, dispatcher TransportDispatcher) NetworkEndpoint {
+	e := &fwdTestNetworkEndpoint{
+		nic:        nic,
+		proto:      f,
+		dispatcher: dispatcher,
+	}
+	e.AddressableEndpointState.Init(e)
+	return e
+}
+
+func (*fwdTestNetworkProtocol) SetOption(tcpip.SettableNetworkProtocolOption) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+func (*fwdTestNetworkProtocol) Option(tcpip.GettableNetworkProtocolOption) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+func (*fwdTestNetworkProtocol) Close() {}
+
+func (*fwdTestNetworkProtocol) Wait() {}
+
+func (f *fwdTestNetworkProtocol) LinkAddressRequest(addr, _ tcpip.Address, remoteLinkAddr tcpip.LinkAddress, _ NetworkInterface) *tcpip.Error {
+	if f.onLinkAddressResolved != nil {
+		time.AfterFunc(f.addrResolveDelay, func() {
+			f.onLinkAddressResolved(f.addrCache, f.neigh, addr, remoteLinkAddr)
+		})
+	}
+	return nil
+}
+
+func (f *fwdTestNetworkProtocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+	if f.onResolveStaticAddress != nil {
+		return f.onResolveStaticAddress(addr)
+	}
+	return "", false
+}
+
+func (*fwdTestNetworkProtocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
+	return fwdTestNetNumber
+}
+
+// Forwarding implements stack.ForwardingNetworkProtocol.
+func (f *fwdTestNetworkProtocol) Forwarding() bool {
+	f.mu.RLock()
+	defer f.mu.RUnlock()
+	return f.mu.forwarding
+
+}
+
+// SetForwarding implements stack.ForwardingNetworkProtocol.
+func (f *fwdTestNetworkProtocol) SetForwarding(v bool) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.mu.forwarding = v
+}
+
+// fwdTestPacketInfo holds all the information about an outbound packet.
+type fwdTestPacketInfo struct {
+	RemoteLinkAddress tcpip.LinkAddress
+	LocalLinkAddress  tcpip.LinkAddress
+	Pkt               *PacketBuffer
+}
+
+type fwdTestLinkEndpoint struct {
+	dispatcher NetworkDispatcher
+	mtu        uint32
+	linkAddr   tcpip.LinkAddress
+
+	// C is where outbound packets are queued.
+	C chan fwdTestPacketInfo
+}
+
+// InjectInbound injects an inbound packet.
+func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
+	e.InjectLinkAddr(protocol, "", pkt)
+}
+
+// InjectLinkAddr injects an inbound packet with a remote link address.
+func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt)
+}
+
+// Attach saves the stack network-layer dispatcher for use later when packets
+// are injected.
+func (e *fwdTestLinkEndpoint) Attach(dispatcher NetworkDispatcher) {
+	e.dispatcher = dispatcher
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *fwdTestLinkEndpoint) IsAttached() bool {
+	return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
+// during construction.
+func (e *fwdTestLinkEndpoint) MTU() uint32 {
+	return e.mtu
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities.
+func (e fwdTestLinkEndpoint) Capabilities() LinkEndpointCapabilities {
+	caps := LinkEndpointCapabilities(0)
+	return caps | CapabilityResolutionRequired
+}
+
+// GSOMaxSize returns the maximum GSO packet size.
+func (*fwdTestLinkEndpoint) GSOMaxSize() uint32 {
+	return 1 << 15
+}
+
+// MaxHeaderLength returns the maximum size of the link layer header. Given it
+// doesn't have a header, it just returns 0.
+func (*fwdTestLinkEndpoint) MaxHeaderLength() uint16 {
+	return 0
+}
+
+// LinkAddress returns the link address of this endpoint.
+func (e *fwdTestLinkEndpoint) LinkAddress() tcpip.LinkAddress {
+	return e.linkAddr
+}
+
+func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error {
+	p := fwdTestPacketInfo{
+		RemoteLinkAddress: r.RemoteLinkAddress,
+		LocalLinkAddress:  r.LocalLinkAddress,
+		Pkt:               pkt,
+	}
+
+	select {
+	case e.C <- p:
+	default:
+	}
+
+	return nil
+}
+
+// WritePackets stores outbound packets into the channel.
+func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	n := 0
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		e.WritePacket(r, gso, protocol, pkt)
+		n++
+	}
+
+	return n, nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	p := fwdTestPacketInfo{
+		Pkt: NewPacketBuffer(PacketBufferOptions{Data: vv}),
+	}
+
+	select {
+	case e.C <- p:
+	default:
+	}
+
+	return nil
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (*fwdTestLinkEndpoint) Wait() {}
+
+// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
+func (*fwdTestLinkEndpoint) ARPHardwareType() header.ARPHardwareType {
+	panic("not implemented")
+}
+
+// AddHeader implements stack.LinkEndpoint.AddHeader.
+func (e *fwdTestLinkEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
+	panic("not implemented")
+}
+
+func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol, useNeighborCache bool) (ep1, ep2 *fwdTestLinkEndpoint) {
+	// Create a stack with the network protocol and two NICs.
+	s := New(Options{
+		NetworkProtocols: []NetworkProtocolFactory{func(*Stack) NetworkProtocol { return proto }},
+		UseNeighborCache: useNeighborCache,
+	})
+
+	if !useNeighborCache {
+		proto.addrCache = s.linkAddrCache
+	}
+
+	// Enable forwarding.
+	s.SetForwarding(proto.Number(), true)
+
+	// NIC 1 has the link address "a", and added the network address 1.
+	ep1 = &fwdTestLinkEndpoint{
+		C:        make(chan fwdTestPacketInfo, 300),
+		mtu:      fwdTestNetDefaultMTU,
+		linkAddr: "a",
+	}
+	if err := s.CreateNIC(1, ep1); err != nil {
+		t.Fatal("CreateNIC #1 failed:", err)
+	}
+	if err := s.AddAddress(1, fwdTestNetNumber, "\x01"); err != nil {
+		t.Fatal("AddAddress #1 failed:", err)
+	}
+
+	// NIC 2 has the link address "b", and added the network address 2.
+	ep2 = &fwdTestLinkEndpoint{
+		C:        make(chan fwdTestPacketInfo, 300),
+		mtu:      fwdTestNetDefaultMTU,
+		linkAddr: "b",
+	}
+	if err := s.CreateNIC(2, ep2); err != nil {
+		t.Fatal("CreateNIC #2 failed:", err)
+	}
+	if err := s.AddAddress(2, fwdTestNetNumber, "\x02"); err != nil {
+		t.Fatal("AddAddress #2 failed:", err)
+	}
+
+	if useNeighborCache {
+		// Control the neighbor cache for NIC 2.
+		nic, ok := s.nics[2]
+		if !ok {
+			t.Fatal("failed to get the neighbor cache for NIC 2")
+		}
+		proto.neigh = nic.neigh
+	}
+
+	// Route all packets to NIC 2.
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, NIC: 2}})
+	}
+
+	return ep1, ep2
+}
+
+func TestForwardingWithStaticResolver(t *testing.T) {
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			// Create a network protocol with a static resolver.
+			proto := &fwdTestNetworkProtocol{
+				onResolveStaticAddress:
+				// The network address 3 is resolved to the link address "c".
+				func(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+					if addr == "\x03" {
+						return "c", true
+					}
+					return "", false
+				},
+			}
+
+			ep1, ep2 := fwdTestNetFactory(t, proto, test.useNeighborCache)
+
+			// Inject an inbound packet to address 3 on NIC 1, and see if it is
+			// forwarded to NIC 2.
+			buf := buffer.NewView(30)
+			buf[dstAddrOffset] = 3
+			ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+				Data: buf.ToVectorisedView(),
+			}))
+
+			var p fwdTestPacketInfo
+
+			select {
+			case p = <-ep2.C:
+			default:
+				t.Fatal("packet not forwarded")
+			}
+
+			// Test that the static address resolution happened correctly.
+			if p.RemoteLinkAddress != "c" {
+				t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+			}
+			if p.LocalLinkAddress != "b" {
+				t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+			}
+		})
+	}
+}
+
+func TestForwardingWithFakeResolver(t *testing.T) {
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+		proto            *fwdTestNetworkProtocol
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) {
+					// Any address will be resolved to the link address "c".
+					cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+				},
+			},
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) {
+					t.Helper()
+					if len(remoteLinkAddr) != 0 {
+						t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr)
+					}
+					// Any address will be resolved to the link address "c".
+					neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{
+						Solicited: true,
+						Override:  false,
+						IsRouter:  false,
+					})
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache)
+
+			// Inject an inbound packet to address 3 on NIC 1, and see if it is
+			// forwarded to NIC 2.
+			buf := buffer.NewView(30)
+			buf[dstAddrOffset] = 3
+			ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+				Data: buf.ToVectorisedView(),
+			}))
+
+			var p fwdTestPacketInfo
+
+			select {
+			case p = <-ep2.C:
+			case <-time.After(time.Second):
+				t.Fatal("packet not forwarded")
+			}
+
+			// Test that the address resolution happened correctly.
+			if p.RemoteLinkAddress != "c" {
+				t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+			}
+			if p.LocalLinkAddress != "b" {
+				t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+			}
+		})
+	}
+}
+
+func TestForwardingWithNoResolver(t *testing.T) {
+	// Create a network protocol without a resolver.
+	proto := &fwdTestNetworkProtocol{}
+
+	// Whether or not we use the neighbor cache here does not matter since
+	// neither linkAddrCache nor neighborCache will be used.
+	ep1, ep2 := fwdTestNetFactory(t, proto, false /* useNeighborCache */)
+
+	// inject an inbound packet to address 3 on NIC 1, and see if it is
+	// forwarded to NIC 2.
+	buf := buffer.NewView(30)
+	buf[dstAddrOffset] = 3
+	ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+		Data: buf.ToVectorisedView(),
+	}))
+
+	select {
+	case <-ep2.C:
+		t.Fatal("Packet should not be forwarded")
+	case <-time.After(time.Second):
+	}
+}
+
+func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+		proto            *fwdTestNetworkProtocol
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) {
+					// Only packets to address 3 will be resolved to the
+					// link address "c".
+					if addr == "\x03" {
+						cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+					}
+				},
+			},
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) {
+					t.Helper()
+					if len(remoteLinkAddr) != 0 {
+						t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr)
+					}
+					// Only packets to address 3 will be resolved to the
+					// link address "c".
+					if addr == "\x03" {
+						neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{
+							Solicited: true,
+							Override:  false,
+							IsRouter:  false,
+						})
+					}
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache)
+
+			// Inject an inbound packet to address 4 on NIC 1. This packet should
+			// not be forwarded.
+			buf := buffer.NewView(30)
+			buf[dstAddrOffset] = 4
+			ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+				Data: buf.ToVectorisedView(),
+			}))
+
+			// Inject an inbound packet to address 3 on NIC 1, and see if it is
+			// forwarded to NIC 2.
+			buf = buffer.NewView(30)
+			buf[dstAddrOffset] = 3
+			ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+				Data: buf.ToVectorisedView(),
+			}))
+
+			var p fwdTestPacketInfo
+
+			select {
+			case p = <-ep2.C:
+			case <-time.After(time.Second):
+				t.Fatal("packet not forwarded")
+			}
+
+			if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 {
+				t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset])
+			}
+
+			// Test that the address resolution happened correctly.
+			if p.RemoteLinkAddress != "c" {
+				t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+			}
+			if p.LocalLinkAddress != "b" {
+				t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+			}
+		})
+	}
+}
+
+func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+		proto            *fwdTestNetworkProtocol
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) {
+					// Any packets will be resolved to the link address "c".
+					cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+				},
+			},
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) {
+					t.Helper()
+					if len(remoteLinkAddr) != 0 {
+						t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr)
+					}
+					// Any packets will be resolved to the link address "c".
+					neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{
+						Solicited: true,
+						Override:  false,
+						IsRouter:  false,
+					})
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache)
+
+			// Inject two inbound packets to address 3 on NIC 1.
+			for i := 0; i < 2; i++ {
+				buf := buffer.NewView(30)
+				buf[dstAddrOffset] = 3
+				ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+					Data: buf.ToVectorisedView(),
+				}))
+			}
+
+			for i := 0; i < 2; i++ {
+				var p fwdTestPacketInfo
+
+				select {
+				case p = <-ep2.C:
+				case <-time.After(time.Second):
+					t.Fatal("packet not forwarded")
+				}
+
+				if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 {
+					t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset])
+				}
+
+				// Test that the address resolution happened correctly.
+				if p.RemoteLinkAddress != "c" {
+					t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+				}
+				if p.LocalLinkAddress != "b" {
+					t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+				}
+			}
+		})
+	}
+}
+
+func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+		proto            *fwdTestNetworkProtocol
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) {
+					// Any packets will be resolved to the link address "c".
+					cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+				},
+			},
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) {
+					t.Helper()
+					if len(remoteLinkAddr) != 0 {
+						t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr)
+					}
+					// Any packets will be resolved to the link address "c".
+					neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{
+						Solicited: true,
+						Override:  false,
+						IsRouter:  false,
+					})
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache)
+
+			for i := 0; i < maxPendingPacketsPerResolution+5; i++ {
+				// Inject inbound 'maxPendingPacketsPerResolution + 5' packets on NIC 1.
+				buf := buffer.NewView(30)
+				buf[dstAddrOffset] = 3
+				// Set the packet sequence number.
+				binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], uint16(i))
+				ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+					Data: buf.ToVectorisedView(),
+				}))
+			}
+
+			for i := 0; i < maxPendingPacketsPerResolution; i++ {
+				var p fwdTestPacketInfo
+
+				select {
+				case p = <-ep2.C:
+				case <-time.After(time.Second):
+					t.Fatal("packet not forwarded")
+				}
+
+				b := PayloadSince(p.Pkt.NetworkHeader())
+				if b[dstAddrOffset] != 3 {
+					t.Fatalf("got b[dstAddrOffset] = %d, want = 3", b[dstAddrOffset])
+				}
+				if len(b) < fwdTestNetHeaderLen+2 {
+					t.Fatalf("packet is too short to hold a sequence number: len(b) = %d", b)
+				}
+				seqNumBuf := b[fwdTestNetHeaderLen:]
+
+				// The first 5 packets should not be forwarded so the sequence number should
+				// start with 5.
+				want := uint16(i + 5)
+				if n := binary.BigEndian.Uint16(seqNumBuf); n != want {
+					t.Fatalf("got the packet #%d, want = #%d", n, want)
+				}
+
+				// Test that the address resolution happened correctly.
+				if p.RemoteLinkAddress != "c" {
+					t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+				}
+				if p.LocalLinkAddress != "b" {
+					t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+				}
+			}
+		})
+	}
+}
+
+func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
+	tests := []struct {
+		name             string
+		useNeighborCache bool
+		proto            *fwdTestNetworkProtocol
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, _ tcpip.LinkAddress) {
+					// Any packets will be resolved to the link address "c".
+					cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+				},
+			},
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+			proto: &fwdTestNetworkProtocol{
+				addrResolveDelay: 500 * time.Millisecond,
+				onLinkAddressResolved: func(cache *linkAddrCache, neigh *neighborCache, addr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) {
+					t.Helper()
+					if len(remoteLinkAddr) != 0 {
+						t.Fatalf("got remoteLinkAddr=%q, want unspecified", remoteLinkAddr)
+					}
+					// Any packets will be resolved to the link address "c".
+					neigh.HandleConfirmation(addr, "c", ReachabilityConfirmationFlags{
+						Solicited: true,
+						Override:  false,
+						IsRouter:  false,
+					})
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ep1, ep2 := fwdTestNetFactory(t, test.proto, test.useNeighborCache)
+
+			for i := 0; i < maxPendingResolutions+5; i++ {
+				// Inject inbound 'maxPendingResolutions + 5' packets on NIC 1.
+				// Each packet has a different destination address (3 to
+				// maxPendingResolutions + 7).
+				buf := buffer.NewView(30)
+				buf[dstAddrOffset] = byte(3 + i)
+				ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{
+					Data: buf.ToVectorisedView(),
+				}))
+			}
+
+			for i := 0; i < maxPendingResolutions; i++ {
+				var p fwdTestPacketInfo
+
+				select {
+				case p = <-ep2.C:
+				case <-time.After(time.Second):
+					t.Fatal("packet not forwarded")
+				}
+
+				// The first 5 packets (address 3 to 7) should not be forwarded
+				// because their address resolutions are interrupted.
+				if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] < 8 {
+					t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want p.Pkt.NetworkHeader[dstAddrOffset] >= 8", nh[dstAddrOffset])
+				}
+
+				// Test that the address resolution happened correctly.
+				if p.RemoteLinkAddress != "c" {
+					t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+				}
+				if p.LocalLinkAddress != "b" {
+					t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
index c37da814f..8d6d9a7f1 100644
--- a/pkg/tcpip/stack/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -57,14 +57,14 @@ const reaperDelay = 5 * time.Second
 // all packets.
 func DefaultTables() *IPTables {
 	return &IPTables{
-		tables: [numTables]Table{
+		v4Tables: [numTables]Table{
 			natID: Table{
 				Rules: []Rule{
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: ErrorTarget{}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}},
+					Rule{Target: &ErrorTarget{NetworkProtocol: header.IPv4ProtocolNumber}},
 				},
 				BuiltinChains: [NumHooks]int{
 					Prerouting:  0,
@@ -83,9 +83,9 @@ func DefaultTables() *IPTables {
 			},
 			mangleID: Table{
 				Rules: []Rule{
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: ErrorTarget{}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}},
+					Rule{Target: &ErrorTarget{NetworkProtocol: header.IPv4ProtocolNumber}},
 				},
 				BuiltinChains: [NumHooks]int{
 					Prerouting: 0,
@@ -101,10 +101,75 @@ func DefaultTables() *IPTables {
 			},
 			filterID: Table{
 				Rules: []Rule{
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: ErrorTarget{}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}},
+					Rule{Target: &ErrorTarget{NetworkProtocol: header.IPv4ProtocolNumber}},
+				},
+				BuiltinChains: [NumHooks]int{
+					Prerouting:  HookUnset,
+					Input:       0,
+					Forward:     1,
+					Output:      2,
+					Postrouting: HookUnset,
+				},
+				Underflows: [NumHooks]int{
+					Prerouting:  HookUnset,
+					Input:       0,
+					Forward:     1,
+					Output:      2,
+					Postrouting: HookUnset,
+				},
+			},
+		},
+		v6Tables: [numTables]Table{
+			natID: Table{
+				Rules: []Rule{
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}},
+					Rule{Target: &ErrorTarget{NetworkProtocol: header.IPv6ProtocolNumber}},
+				},
+				BuiltinChains: [NumHooks]int{
+					Prerouting:  0,
+					Input:       1,
+					Forward:     HookUnset,
+					Output:      2,
+					Postrouting: 3,
+				},
+				Underflows: [NumHooks]int{
+					Prerouting:  0,
+					Input:       1,
+					Forward:     HookUnset,
+					Output:      2,
+					Postrouting: 3,
+				},
+			},
+			mangleID: Table{
+				Rules: []Rule{
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}},
+					Rule{Target: &ErrorTarget{NetworkProtocol: header.IPv6ProtocolNumber}},
+				},
+				BuiltinChains: [NumHooks]int{
+					Prerouting: 0,
+					Output:     1,
+				},
+				Underflows: [NumHooks]int{
+					Prerouting:  0,
+					Input:       HookUnset,
+					Forward:     HookUnset,
+					Output:      1,
+					Postrouting: HookUnset,
+				},
+			},
+			filterID: Table{
+				Rules: []Rule{
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}},
+					Rule{Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}},
+					Rule{Target: &ErrorTarget{NetworkProtocol: header.IPv6ProtocolNumber}},
 				},
 				BuiltinChains: [NumHooks]int{
 					Prerouting:  HookUnset,
@@ -165,18 +230,21 @@ func EmptyNATTable() Table {
 }
 
 // GetTable returns a table by name.
-func (it *IPTables) GetTable(name string) (Table, bool) {
+func (it *IPTables) GetTable(name string, ipv6 bool) (Table, bool) {
 	id, ok := nameToID[name]
 	if !ok {
 		return Table{}, false
 	}
 	it.mu.RLock()
 	defer it.mu.RUnlock()
-	return it.tables[id], true
+	if ipv6 {
+		return it.v6Tables[id], true
+	}
+	return it.v4Tables[id], true
 }
 
 // ReplaceTable replaces or inserts table by name.
-func (it *IPTables) ReplaceTable(name string, table Table) *tcpip.Error {
+func (it *IPTables) ReplaceTable(name string, table Table, ipv6 bool) *tcpip.Error {
 	id, ok := nameToID[name]
 	if !ok {
 		return tcpip.ErrInvalidOptionValue
@@ -190,7 +258,11 @@ func (it *IPTables) ReplaceTable(name string, table Table) *tcpip.Error {
 		it.startReaper(reaperDelay)
 	}
 	it.modified = true
-	it.tables[id] = table
+	if ipv6 {
+		it.v6Tables[id] = table
+	} else {
+		it.v4Tables[id] = table
+	}
 	return nil
 }
 
@@ -213,8 +285,15 @@ const (
 // should continue traversing the network stack and false when it should be
 // dropped.
 //
+// TODO(gvisor.dev/issue/170): PacketBuffer should hold the GSO and route, from
+// which address and nicName can be gathered. Currently, address is only
+// needed for prerouting and nicName is only needed for output.
+//
 // Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, address tcpip.Address, nicName string) bool {
+func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) bool {
+	if pkt.NetworkProtocolNumber != header.IPv4ProtocolNumber && pkt.NetworkProtocolNumber != header.IPv6ProtocolNumber {
+		return true
+	}
 	// Many users never configure iptables. Spare them the cost of rule
 	// traversal if rules have never been set.
 	it.mu.RLock()
@@ -235,9 +314,14 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr
 		if tableID == natID && pkt.NatDone {
 			continue
 		}
-		table := it.tables[tableID]
+		var table Table
+		if pkt.NetworkProtocolNumber == header.IPv6ProtocolNumber {
+			table = it.v6Tables[tableID]
+		} else {
+			table = it.v4Tables[tableID]
+		}
 		ruleIdx := table.BuiltinChains[hook]
-		switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict {
+		switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, nicName); verdict {
 		// If the table returns Accept, move on to the next table.
 		case chainAccept:
 			continue
@@ -248,7 +332,7 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr
 			// Any Return from a built-in chain means we have to
 			// call the underflow.
 			underflow := table.Rules[table.Underflows[hook]]
-			switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, gso, r, address); v {
+			switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, gso, r, preroutingAddr); v {
 			case RuleAccept:
 				continue
 			case RuleDrop:
@@ -315,8 +399,8 @@ func (it *IPTables) startReaper(interval time.Duration) {
 // should not go forward.
 //
 // Preconditions:
-// - pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
-// - pkt.NetworkHeader is not nil.
+// * pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// * pkt.NetworkHeader is not nil.
 //
 // NOTE: unlike the Check API the returned map contains packets that should be
 // dropped.
@@ -341,13 +425,13 @@ func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r *
 }
 
 // Preconditions:
-// - pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
-// - pkt.NetworkHeader is not nil.
-func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) chainVerdict {
+// * pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// * pkt.NetworkHeader is not nil.
+func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) chainVerdict {
 	// Start from ruleIdx and walk the list of rules until a rule gives us
 	// a verdict.
 	for ruleIdx < len(table.Rules) {
-		switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict {
+		switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, nicName); verdict {
 		case RuleAccept:
 			return chainAccept
 
@@ -364,7 +448,7 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId
 				ruleIdx++
 				continue
 			}
-			switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, address, nicName); verdict {
+			switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, preroutingAddr, nicName); verdict {
 			case chainAccept:
 				return chainAccept
 			case chainDrop:
@@ -388,13 +472,13 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId
 }
 
 // Preconditions:
-// - pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
-// - pkt.NetworkHeader is not nil.
-func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) (RuleVerdict, int) {
+// * pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// * pkt.NetworkHeader is not nil.
+func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) (RuleVerdict, int) {
 	rule := table.Rules[ruleIdx]
 
 	// Check whether the packet matches the IP header filter.
-	if !rule.Filter.match(header.IPv4(pkt.NetworkHeader().View()), hook, nicName) {
+	if !rule.Filter.match(pkt, hook, nicName) {
 		// Continue on to the next rule.
 		return RuleJump, ruleIdx + 1
 	}
@@ -413,11 +497,16 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx
 	}
 
 	// All the matchers matched, so run the target.
-	return rule.Target.Action(pkt, &it.connections, hook, gso, r, address)
+	return rule.Target.Action(pkt, &it.connections, hook, gso, r, preroutingAddr)
 }
 
 // OriginalDst returns the original destination of redirected connections. It
 // returns an error if the connection doesn't exist or isn't redirected.
-func (it *IPTables) OriginalDst(epID TransportEndpointID) (tcpip.Address, uint16, *tcpip.Error) {
-	return it.connections.originalDst(epID)
+func (it *IPTables) OriginalDst(epID TransportEndpointID, netProto tcpip.NetworkProtocolNumber) (tcpip.Address, uint16, *tcpip.Error) {
+	it.mu.RLock()
+	defer it.mu.RUnlock()
+	if !it.modified {
+		return "", 0, tcpip.ErrNotConnected
+	}
+	return it.connections.originalDst(epID, netProto)
 }
diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go
index 5f1b2af64..538c4625d 100644
--- a/pkg/tcpip/stack/iptables_targets.go
+++ b/pkg/tcpip/stack/iptables_targets.go
@@ -21,78 +21,139 @@ import (
 )
 
 // AcceptTarget accepts packets.
-type AcceptTarget struct{}
+type AcceptTarget struct {
+	// NetworkProtocol is the network protocol the target is used with.
+	NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+// ID implements Target.ID.
+func (at *AcceptTarget) ID() TargetID {
+	return TargetID{
+		NetworkProtocol: at.NetworkProtocol,
+	}
+}
 
 // Action implements Target.Action.
-func (AcceptTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+func (*AcceptTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
 	return RuleAccept, 0
 }
 
 // DropTarget drops packets.
-type DropTarget struct{}
+type DropTarget struct {
+	// NetworkProtocol is the network protocol the target is used with.
+	NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+// ID implements Target.ID.
+func (dt *DropTarget) ID() TargetID {
+	return TargetID{
+		NetworkProtocol: dt.NetworkProtocol,
+	}
+}
 
 // Action implements Target.Action.
-func (DropTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+func (*DropTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
 	return RuleDrop, 0
 }
 
+// ErrorTargetName is used to mark targets as error targets. Error targets
+// shouldn't be reached - an error has occurred if we fall through to one.
+const ErrorTargetName = "ERROR"
+
 // ErrorTarget logs an error and drops the packet. It represents a target that
 // should be unreachable.
-type ErrorTarget struct{}
+type ErrorTarget struct {
+	// NetworkProtocol is the network protocol the target is used with.
+	NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+// ID implements Target.ID.
+func (et *ErrorTarget) ID() TargetID {
+	return TargetID{
+		Name:            ErrorTargetName,
+		NetworkProtocol: et.NetworkProtocol,
+	}
+}
 
 // Action implements Target.Action.
-func (ErrorTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+func (*ErrorTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
 	log.Debugf("ErrorTarget triggered.")
 	return RuleDrop, 0
 }
 
 // UserChainTarget marks a rule as the beginning of a user chain.
 type UserChainTarget struct {
+	// Name is the chain name.
 	Name string
+
+	// NetworkProtocol is the network protocol the target is used with.
+	NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+// ID implements Target.ID.
+func (uc *UserChainTarget) ID() TargetID {
+	return TargetID{
+		Name:            ErrorTargetName,
+		NetworkProtocol: uc.NetworkProtocol,
+	}
 }
 
 // Action implements Target.Action.
-func (UserChainTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+func (*UserChainTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
 	panic("UserChainTarget should never be called.")
 }
 
 // ReturnTarget returns from the current chain. If the chain is a built-in, the
 // hook's underflow should be called.
-type ReturnTarget struct{}
+type ReturnTarget struct {
+	// NetworkProtocol is the network protocol the target is used with.
+	NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+// ID implements Target.ID.
+func (rt *ReturnTarget) ID() TargetID {
+	return TargetID{
+		NetworkProtocol: rt.NetworkProtocol,
+	}
+}
 
 // Action implements Target.Action.
-func (ReturnTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+func (*ReturnTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
 	return RuleReturn, 0
 }
 
+// RedirectTargetName is used to mark targets as redirect targets. Redirect
+// targets should be reached for only NAT and Mangle tables. These targets will
+// change the destination port/destination IP for packets.
+const RedirectTargetName = "REDIRECT"
+
 // RedirectTarget redirects the packet by modifying the destination port/IP.
-// Min and Max values for IP and Ports in the struct indicate the range of
-// values which can be used to redirect.
+// TODO(gvisor.dev/issue/170): Other flags need to be added after we support
+// them.
 type RedirectTarget struct {
-	// TODO(gvisor.dev/issue/170): Other flags need to be added after
-	// we support them.
-	// RangeProtoSpecified flag indicates single port is specified to
-	// redirect.
-	RangeProtoSpecified bool
+	// Addr indicates address used to redirect.
+	Addr tcpip.Address
 
-	// MinIP indicates address used to redirect.
-	MinIP tcpip.Address
+	// Port indicates port used to redirect.
+	Port uint16
 
-	// MaxIP indicates address used to redirect.
-	MaxIP tcpip.Address
-
-	// MinPort indicates port used to redirect.
-	MinPort uint16
+	// NetworkProtocol is the network protocol the target is used with.
+	NetworkProtocol tcpip.NetworkProtocolNumber
+}
 
-	// MaxPort indicates port used to redirect.
-	MaxPort uint16
+// ID implements Target.ID.
+func (rt *RedirectTarget) ID() TargetID {
+	return TargetID{
+		Name:            RedirectTargetName,
+		NetworkProtocol: rt.NetworkProtocol,
+	}
 }
 
 // Action implements Target.Action.
 // TODO(gvisor.dev/issue/170): Parse headers without copying. The current
 // implementation only works for PREROUTING and calls pkt.Clone(), neither
 // of which should be the case.
-func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int) {
+func (rt *RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int) {
 	// Packet is already manipulated.
 	if pkt.NatDone {
 		return RuleAccept, 0
@@ -103,34 +164,35 @@ func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, gso
 		return RuleDrop, 0
 	}
 
-	// Change the address to localhost (127.0.0.1) in Output and
-	// to primary address of the incoming interface in Prerouting.
+	// Change the address to localhost (127.0.0.1 or ::1) in Output and to
+	// the primary address of the incoming interface in Prerouting.
 	switch hook {
 	case Output:
-		rt.MinIP = tcpip.Address([]byte{127, 0, 0, 1})
-		rt.MaxIP = tcpip.Address([]byte{127, 0, 0, 1})
+		if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber {
+			rt.Addr = tcpip.Address([]byte{127, 0, 0, 1})
+		} else {
+			rt.Addr = header.IPv6Loopback
+		}
 	case Prerouting:
-		rt.MinIP = address
-		rt.MaxIP = address
+		rt.Addr = address
 	default:
 		panic("redirect target is supported only on output and prerouting hooks")
 	}
 
 	// TODO(gvisor.dev/issue/170): Check Flags in RedirectTarget if
 	// we need to change dest address (for OUTPUT chain) or ports.
-	netHeader := header.IPv4(pkt.NetworkHeader().View())
-	switch protocol := netHeader.TransportProtocol(); protocol {
+	switch protocol := pkt.TransportProtocolNumber; protocol {
 	case header.UDPProtocolNumber:
 		udpHeader := header.UDP(pkt.TransportHeader().View())
-		udpHeader.SetDestinationPort(rt.MinPort)
+		udpHeader.SetDestinationPort(rt.Port)
 
 		// Calculate UDP checksum and set it.
 		if hook == Output {
 			udpHeader.SetChecksum(0)
-			length := uint16(pkt.Size()) - uint16(netHeader.HeaderLength())
 
 			// Only calculate the checksum if offloading isn't supported.
 			if r.Capabilities()&CapabilityTXChecksumOffload == 0 {
+				length := uint16(pkt.Size()) - uint16(len(pkt.NetworkHeader().View()))
 				xsum := r.PseudoHeaderChecksum(protocol, length)
 				for _, v := range pkt.Data.Views() {
 					xsum = header.Checksum(v, xsum)
@@ -139,10 +201,15 @@ func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, gso
 				udpHeader.SetChecksum(^udpHeader.CalculateChecksum(xsum))
 			}
 		}
-		// Change destination address.
-		netHeader.SetDestinationAddress(rt.MinIP)
-		netHeader.SetChecksum(0)
-		netHeader.SetChecksum(^netHeader.CalculateChecksum())
+
+		pkt.Network().SetDestinationAddress(rt.Addr)
+
+		// After modification, IPv4 packets need a valid checksum.
+		if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber {
+			netHeader := header.IPv4(pkt.NetworkHeader().View())
+			netHeader.SetChecksum(0)
+			netHeader.SetChecksum(^netHeader.CalculateChecksum())
+		}
 		pkt.NatDone = true
 	case header.TCPProtocolNumber:
 		if ct == nil {
diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go
index 73274ada9..7b3f3e88b 100644
--- a/pkg/tcpip/stack/iptables_types.go
+++ b/pkg/tcpip/stack/iptables_types.go
@@ -15,6 +15,7 @@
 package stack
 
 import (
+	"fmt"
 	"strings"
 	"sync"
 
@@ -81,31 +82,42 @@ const (
 //
 // +stateify savable
 type IPTables struct {
-	// mu protects tables, priorities, and modified.
+	// mu protects v4Tables, v6Tables, and modified.
 	mu sync.RWMutex
-
-	// tables maps tableIDs to tables. Holds builtin tables only, not user
-	// tables. mu must be locked for accessing.
-	tables [numTables]Table
-
-	// priorities maps each hook to a list of table names. The order of the
-	// list is the order in which each table should be visited for that
-	// hook. mu needs to be locked for accessing.
-	priorities [NumHooks][]tableID
-
+	// v4Tables and v6tables map tableIDs to tables. They hold builtin
+	// tables only, not user tables. mu must be locked for accessing.
+	v4Tables [numTables]Table
+	v6Tables [numTables]Table
 	// modified is whether tables have been modified at least once. It is
 	// used to elide the iptables performance overhead for workloads that
 	// don't utilize iptables.
 	modified bool
 
+	// priorities maps each hook to a list of table names. The order of the
+	// list is the order in which each table should be visited for that
+	// hook. It is immutable.
+	priorities [NumHooks][]tableID
+
 	connections ConnTrack
 
-	// reaperDone can be signalled to stop the reaper goroutine.
+	// reaperDone can be signaled to stop the reaper goroutine.
 	reaperDone chan struct{}
 }
 
-// A Table defines a set of chains and hooks into the network stack. It is
-// really just a list of rules.
+// A Table defines a set of chains and hooks into the network stack.
+//
+// It is a list of Rules, entry points (BuiltinChains), and error handlers
+// (Underflows). As packets traverse netstack, they hit hooks. When a packet
+// hits a hook, iptables compares it to Rules starting from that hook's entry
+// point. So if a packet hits the Input hook, we look up the corresponding
+// entry point in BuiltinChains and jump to that point.
+//
+// If the Rule doesn't match the packet, iptables continues to the next Rule.
+// If a Rule does match, it can issue a verdict on the packet (e.g. RuleAccept
+// or RuleDrop) that causes the packet to stop traversing iptables. It can also
+// jump to other rules or perform custom actions based on Rule.Target.
+//
+// Underflow Rules are invoked when a chain returns without reaching a verdict.
 //
 // +stateify savable
 type Table struct {
@@ -148,13 +160,18 @@ type Rule struct {
 	Target Target
 }
 
-// IPHeaderFilter holds basic IP filtering data common to every rule.
+// IPHeaderFilter performs basic IP header matching common to every rule.
 //
 // +stateify savable
 type IPHeaderFilter struct {
 	// Protocol matches the transport protocol.
 	Protocol tcpip.TransportProtocolNumber
 
+	// CheckProtocol determines whether the Protocol field should be
+	// checked during matching.
+	// TODO(gvisor.dev/issue/3549): Check this field during matching.
+	CheckProtocol bool
+
 	// Dst matches the destination IP address.
 	Dst tcpip.Address
 
@@ -191,16 +208,43 @@ type IPHeaderFilter struct {
 	OutputInterfaceInvert bool
 }
 
-// match returns whether hdr matches the filter.
-func (fl IPHeaderFilter) match(hdr header.IPv4, hook Hook, nicName string) bool {
-	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
+// match returns whether pkt matches the filter.
+//
+// Preconditions: pkt.NetworkHeader is set and is at least of the minimal IPv4
+// or IPv6 header length.
+func (fl IPHeaderFilter) match(pkt *PacketBuffer, hook Hook, nicName string) bool {
+	// Extract header fields.
+	var (
+		// TODO(gvisor.dev/issue/170): Support other filter fields.
+		transProto tcpip.TransportProtocolNumber
+		dstAddr    tcpip.Address
+		srcAddr    tcpip.Address
+	)
+	switch proto := pkt.NetworkProtocolNumber; proto {
+	case header.IPv4ProtocolNumber:
+		hdr := header.IPv4(pkt.NetworkHeader().View())
+		transProto = hdr.TransportProtocol()
+		dstAddr = hdr.DestinationAddress()
+		srcAddr = hdr.SourceAddress()
+
+	case header.IPv6ProtocolNumber:
+		hdr := header.IPv6(pkt.NetworkHeader().View())
+		transProto = hdr.TransportProtocol()
+		dstAddr = hdr.DestinationAddress()
+		srcAddr = hdr.SourceAddress()
+
+	default:
+		panic(fmt.Sprintf("unknown network protocol with EtherType: %d", proto))
+	}
+
 	// Check the transport protocol.
-	if fl.Protocol != 0 && fl.Protocol != hdr.TransportProtocol() {
+	if fl.CheckProtocol && fl.Protocol != transProto {
 		return false
 	}
 
-	// Check the source and destination IPs.
-	if !filterAddress(hdr.DestinationAddress(), fl.DstMask, fl.Dst, fl.DstInvert) || !filterAddress(hdr.SourceAddress(), fl.SrcMask, fl.Src, fl.SrcInvert) {
+	// Check the addresses.
+	if !filterAddress(dstAddr, fl.DstMask, fl.Dst, fl.DstInvert) ||
+		!filterAddress(srcAddr, fl.SrcMask, fl.Src, fl.SrcInvert) {
 		return false
 	}
 
@@ -228,6 +272,18 @@ func (fl IPHeaderFilter) match(hdr header.IPv4, hook Hook, nicName string) bool
 	return true
 }
 
+// NetworkProtocol returns the protocol (IPv4 or IPv6) on to which the header
+// applies.
+func (fl IPHeaderFilter) NetworkProtocol() tcpip.NetworkProtocolNumber {
+	switch len(fl.Src) {
+	case header.IPv4AddressSize:
+		return header.IPv4ProtocolNumber
+	case header.IPv6AddressSize:
+		return header.IPv6ProtocolNumber
+	}
+	panic(fmt.Sprintf("invalid address in IPHeaderFilter: %s", fl.Src))
+}
+
 // filterAddress returns whether addr matches the filter.
 func filterAddress(addr, mask, filterAddr tcpip.Address, invert bool) bool {
 	matches := true
@@ -253,8 +309,23 @@ type Matcher interface {
 	Match(hook Hook, packet *PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
 }
 
+// A TargetID uniquely identifies a target.
+type TargetID struct {
+	// Name is the target name as stored in the xt_entry_target struct.
+	Name string
+
+	// NetworkProtocol is the protocol to which the target applies.
+	NetworkProtocol tcpip.NetworkProtocolNumber
+
+	// Revision is the version of the target.
+	Revision uint8
+}
+
 // A Target is the interface for taking an action for a packet.
 type Target interface {
+	// ID uniquely identifies the Target.
+	ID() TargetID
+
 	// Action takes an action on the packet and returns a verdict on how
 	// traversal should (or should not) continue. If the return value is
 	// Jump, it also returns the index of the rule to jump to.
diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go
index 6f73a0ce4..c9b13cd0e 100644
--- a/pkg/tcpip/stack/linkaddrcache.go
+++ b/pkg/tcpip/stack/linkaddrcache.go
@@ -180,7 +180,7 @@ func (c *linkAddrCache) getOrCreateEntryLocked(k tcpip.FullAddress) *linkAddrEnt
 }
 
 // get reports any known link address for k.
-func (c *linkAddrCache) get(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) {
+func (c *linkAddrCache) get(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, nic NetworkInterface, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) {
 	if linkRes != nil {
 		if addr, ok := linkRes.ResolveStaticAddress(k.Addr); ok {
 			return addr, nil, nil
@@ -221,7 +221,7 @@ func (c *linkAddrCache) get(k tcpip.FullAddress, linkRes LinkAddressResolver, lo
 			}
 
 			entry.done = make(chan struct{})
-			go c.startAddressResolution(k, linkRes, localAddr, linkEP, entry.done) // S/R-SAFE: link non-savable; wakers dropped synchronously.
+			go c.startAddressResolution(k, linkRes, localAddr, nic, entry.done) // S/R-SAFE: link non-savable; wakers dropped synchronously.
 		}
 
 		return entry.linkAddr, entry.done, tcpip.ErrWouldBlock
@@ -240,11 +240,11 @@ func (c *linkAddrCache) removeWaker(k tcpip.FullAddress, waker *sleep.Waker) {
 	}
 }
 
-func (c *linkAddrCache) startAddressResolution(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, done <-chan struct{}) {
+func (c *linkAddrCache) startAddressResolution(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, nic NetworkInterface, done <-chan struct{}) {
 	for i := 0; ; i++ {
 		// Send link request, then wait for the timeout limit and check
 		// whether the request succeeded.
-		linkRes.LinkAddressRequest(k.Addr, localAddr, "" /* linkAddr */, linkEP)
+		linkRes.LinkAddressRequest(k.Addr, localAddr, "" /* linkAddr */, nic)
 
 		select {
 		case now := <-time.After(c.resolutionTimeout):
diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go
index b15b8d1cb..d2e37f38d 100644
--- a/pkg/tcpip/stack/linkaddrcache_test.go
+++ b/pkg/tcpip/stack/linkaddrcache_test.go
@@ -16,6 +16,7 @@ package stack
 
 import (
 	"fmt"
+	"math"
 	"sync/atomic"
 	"testing"
 	"time"
@@ -48,8 +49,8 @@ type testLinkAddressResolver struct {
 	onLinkAddressRequest func()
 }
 
-func (r *testLinkAddressResolver) LinkAddressRequest(addr, _ tcpip.Address, _ tcpip.LinkAddress, _ LinkEndpoint) *tcpip.Error {
-	time.AfterFunc(r.delay, func() { r.fakeRequest(addr) })
+func (r *testLinkAddressResolver) LinkAddressRequest(targetAddr, _ tcpip.Address, _ tcpip.LinkAddress, _ NetworkInterface) *tcpip.Error {
+	time.AfterFunc(r.delay, func() { r.fakeRequest(targetAddr) })
 	if f := r.onLinkAddressRequest; f != nil {
 		f()
 	}
@@ -191,7 +192,13 @@ func TestCacheReplace(t *testing.T) {
 }
 
 func TestCacheResolution(t *testing.T) {
-	c := newLinkAddrCache(1<<63-1, 250*time.Millisecond, 1)
+	// There is a race condition causing this test to fail when the executor
+	// takes longer than the resolution timeout to call linkAddrCache.get. This
+	// is especially common when this test is run with gotsan.
+	//
+	// Using a large resolution timeout decreases the probability of experiencing
+	// this race condition and does not affect how long this test takes to run.
+	c := newLinkAddrCache(1<<63-1, math.MaxInt64, 1)
 	linkRes := &testLinkAddressResolver{cache: c}
 	for i, ta := range testAddrs {
 		got, err := getBlocking(c, ta.addr, linkRes)
@@ -275,3 +282,71 @@ func TestStaticResolution(t *testing.T) {
 		t.Errorf("c.get(%q)=%q, want %q", string(addr), string(got), string(want))
 	}
 }
+
+// TestCacheWaker verifies that RemoveWaker removes a waker previously added
+// through get().
+func TestCacheWaker(t *testing.T) {
+	c := newLinkAddrCache(1<<63-1, 1*time.Second, 3)
+
+	// First, sanity check that wakers are working.
+	{
+		linkRes := &testLinkAddressResolver{cache: c}
+		s := sleep.Sleeper{}
+		defer s.Done()
+
+		const wakerID = 1
+		w := sleep.Waker{}
+		s.AddWaker(&w, wakerID)
+
+		e := testAddrs[0]
+
+		if _, _, err := c.get(e.addr, linkRes, "", nil, &w); err != tcpip.ErrWouldBlock {
+			t.Fatalf("got c.get(%q, _, _, _, _) = %s, want = %s", e.addr.Addr, err, tcpip.ErrWouldBlock)
+		}
+		id, ok := s.Fetch(true /* block */)
+		if !ok {
+			t.Fatal("got s.Fetch(true) = (_, false), want = (_, true)")
+		}
+		if id != wakerID {
+			t.Fatalf("got s.Fetch(true) = (%d, %t), want = (%d, true)", id, ok, wakerID)
+		}
+
+		if got, _, err := c.get(e.addr, linkRes, "", nil, nil); err != nil {
+			t.Fatalf("c.get(%q, _, _, _, _): %s", e.addr.Addr, err)
+		} else if got != e.linkAddr {
+			t.Fatalf("got c.get(%q) = %q, want = %q", e.addr.Addr, got, e.linkAddr)
+		}
+	}
+
+	// Check that RemoveWaker works.
+	{
+		linkRes := &testLinkAddressResolver{cache: c}
+		s := sleep.Sleeper{}
+		defer s.Done()
+
+		const wakerID = 2 // different than the ID used in the sanity check
+		w := sleep.Waker{}
+		s.AddWaker(&w, wakerID)
+
+		e := testAddrs[1]
+		linkRes.onLinkAddressRequest = func() {
+			// Remove the waker before the linkAddrCache has the opportunity to send
+			// a notification.
+			c.removeWaker(e.addr, &w)
+		}
+
+		if _, _, err := c.get(e.addr, linkRes, "", nil, &w); err != tcpip.ErrWouldBlock {
+			t.Fatalf("got c.get(%q, _, _, _, _) = %s, want = %s", e.addr.Addr, err, tcpip.ErrWouldBlock)
+		}
+
+		if got, err := getBlocking(c, e.addr, linkRes); err != nil {
+			t.Fatalf("c.get(%q, _, _, _, _): %s", e.addr.Addr, err)
+		} else if got != e.linkAddr {
+			t.Fatalf("c.get(%q) = %q, want = %q", e.addr.Addr, got, e.linkAddr)
+		}
+
+		if id, ok := s.Fetch(false /* block */); ok {
+			t.Fatalf("unexpected notification from waker with id %d", id)
+		}
+	}
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 21bf53010..73a01c2dd 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -150,10 +150,10 @@ type ndpDNSSLEvent struct {
 
 type ndpDHCPv6Event struct {
 	nicID         tcpip.NICID
-	configuration stack.DHCPv6ConfigurationFromNDPRA
+	configuration ipv6.DHCPv6ConfigurationFromNDPRA
 }
 
-var _ stack.NDPDispatcher = (*ndpDispatcher)(nil)
+var _ ipv6.NDPDispatcher = (*ndpDispatcher)(nil)
 
 // ndpDispatcher implements NDPDispatcher so tests can know when various NDP
 // related events happen for test purposes.
@@ -170,7 +170,7 @@ type ndpDispatcher struct {
 	dhcpv6ConfigurationC chan ndpDHCPv6Event
 }
 
-// Implements stack.NDPDispatcher.OnDuplicateAddressDetectionStatus.
+// Implements ipv6.NDPDispatcher.OnDuplicateAddressDetectionStatus.
 func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) {
 	if n.dadC != nil {
 		n.dadC <- ndpDADEvent{
@@ -182,7 +182,7 @@ func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, add
 	}
 }
 
-// Implements stack.NDPDispatcher.OnDefaultRouterDiscovered.
+// Implements ipv6.NDPDispatcher.OnDefaultRouterDiscovered.
 func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) bool {
 	if c := n.routerC; c != nil {
 		c <- ndpRouterEvent{
@@ -195,7 +195,7 @@ func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.
 	return n.rememberRouter
 }
 
-// Implements stack.NDPDispatcher.OnDefaultRouterInvalidated.
+// Implements ipv6.NDPDispatcher.OnDefaultRouterInvalidated.
 func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) {
 	if c := n.routerC; c != nil {
 		c <- ndpRouterEvent{
@@ -206,7 +206,7 @@ func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip
 	}
 }
 
-// Implements stack.NDPDispatcher.OnOnLinkPrefixDiscovered.
+// Implements ipv6.NDPDispatcher.OnOnLinkPrefixDiscovered.
 func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) bool {
 	if c := n.prefixC; c != nil {
 		c <- ndpPrefixEvent{
@@ -219,7 +219,7 @@ func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip
 	return n.rememberPrefix
 }
 
-// Implements stack.NDPDispatcher.OnOnLinkPrefixInvalidated.
+// Implements ipv6.NDPDispatcher.OnOnLinkPrefixInvalidated.
 func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) {
 	if c := n.prefixC; c != nil {
 		c <- ndpPrefixEvent{
@@ -261,7 +261,7 @@ func (n *ndpDispatcher) OnAutoGenAddressInvalidated(nicID tcpip.NICID, addr tcpi
 	}
 }
 
-// Implements stack.NDPDispatcher.OnRecursiveDNSServerOption.
+// Implements ipv6.NDPDispatcher.OnRecursiveDNSServerOption.
 func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration) {
 	if c := n.rdnssC; c != nil {
 		c <- ndpRDNSSEvent{
@@ -274,7 +274,7 @@ func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tc
 	}
 }
 
-// Implements stack.NDPDispatcher.OnDNSSearchListOption.
+// Implements ipv6.NDPDispatcher.OnDNSSearchListOption.
 func (n *ndpDispatcher) OnDNSSearchListOption(nicID tcpip.NICID, domainNames []string, lifetime time.Duration) {
 	if n.dnsslC != nil {
 		n.dnsslC <- ndpDNSSLEvent{
@@ -285,8 +285,8 @@ func (n *ndpDispatcher) OnDNSSearchListOption(nicID tcpip.NICID, domainNames []s
 	}
 }
 
-// Implements stack.NDPDispatcher.OnDHCPv6Configuration.
-func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration stack.DHCPv6ConfigurationFromNDPRA) {
+// Implements ipv6.NDPDispatcher.OnDHCPv6Configuration.
+func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration ipv6.DHCPv6ConfigurationFromNDPRA) {
 	if c := n.dhcpv6ConfigurationC; c != nil {
 		c <- ndpDHCPv6Event{
 			nicID,
@@ -319,13 +319,12 @@ func TestDADDisabled(t *testing.T) {
 	ndpDisp := ndpDispatcher{
 		dadC: make(chan ndpDADEvent, 1),
 	}
-	opts := stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPDisp:          &ndpDisp,
-	}
-
 	e := channel.New(0, 1280, linkAddr1)
-	s := stack.New(opts)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPDisp: &ndpDisp,
+		})},
+	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 	}
@@ -413,19 +412,21 @@ func TestDADResolve(t *testing.T) {
 			ndpDisp := ndpDispatcher{
 				dadC: make(chan ndpDADEvent),
 			}
-			opts := stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				NDPDisp:          &ndpDisp,
-			}
-			opts.NDPConfigs.RetransmitTimer = test.retransTimer
-			opts.NDPConfigs.DupAddrDetectTransmits = test.dupAddrDetectTransmits
 
 			e := channelLinkWithHeaderLength{
 				Endpoint:     channel.New(int(test.dupAddrDetectTransmits), 1280, linkAddr1),
 				headerLength: test.linkHeaderLen,
 			}
 			e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
-			s := stack.New(opts)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					NDPDisp: &ndpDisp,
+					NDPConfigs: ipv6.NDPConfigurations{
+						RetransmitTimer:        test.retransTimer,
+						DupAddrDetectTransmits: test.dupAddrDetectTransmits,
+					},
+				})},
+			})
 			if err := s.CreateNIC(nicID, &e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
@@ -558,6 +559,26 @@ func TestDADResolve(t *testing.T) {
 	}
 }
 
+func rxNDPSolicit(e *channel.Endpoint, tgt tcpip.Address) {
+	hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborSolicitMinimumSize)
+	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
+	pkt.SetType(header.ICMPv6NeighborSolicit)
+	ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+	ns.SetTargetAddress(tgt)
+	snmc := header.SolicitedNodeAddr(tgt)
+	pkt.SetChecksum(header.ICMPv6Checksum(pkt, header.IPv6Any, snmc, buffer.VectorisedView{}))
+	payloadLength := hdr.UsedLength()
+	ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+	ip.Encode(&header.IPv6Fields{
+		PayloadLength: uint16(payloadLength),
+		NextHeader:    uint8(icmp.ProtocolNumber6),
+		HopLimit:      255,
+		SrcAddr:       header.IPv6Any,
+		DstAddr:       snmc,
+	})
+	e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{Data: hdr.View().ToVectorisedView()}))
+}
+
 // TestDADFail tests to make sure that the DAD process fails if another node is
 // detected to be performing DAD on the same address (receive an NS message from
 // a node doing DAD for the same address), or if another node is detected to own
@@ -567,39 +588,19 @@ func TestDADFail(t *testing.T) {
 
 	tests := []struct {
 		name    string
-		makeBuf func(tgt tcpip.Address) buffer.Prependable
+		rxPkt   func(e *channel.Endpoint, tgt tcpip.Address)
 		getStat func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
 	}{
 		{
-			"RxSolicit",
-			func(tgt tcpip.Address) buffer.Prependable {
-				hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborSolicitMinimumSize)
-				pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
-				pkt.SetType(header.ICMPv6NeighborSolicit)
-				ns := header.NDPNeighborSolicit(pkt.NDPPayload())
-				ns.SetTargetAddress(tgt)
-				snmc := header.SolicitedNodeAddr(tgt)
-				pkt.SetChecksum(header.ICMPv6Checksum(pkt, header.IPv6Any, snmc, buffer.VectorisedView{}))
-				payloadLength := hdr.UsedLength()
-				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
-				ip.Encode(&header.IPv6Fields{
-					PayloadLength: uint16(payloadLength),
-					NextHeader:    uint8(icmp.ProtocolNumber6),
-					HopLimit:      255,
-					SrcAddr:       header.IPv6Any,
-					DstAddr:       snmc,
-				})
-
-				return hdr
-
-			},
-			func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			name:  "RxSolicit",
+			rxPkt: rxNDPSolicit,
+			getStat: func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return s.NeighborSolicit
 			},
 		},
 		{
-			"RxAdvert",
-			func(tgt tcpip.Address) buffer.Prependable {
+			name: "RxAdvert",
+			rxPkt: func(e *channel.Endpoint, tgt tcpip.Address) {
 				naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
 				hdr := buffer.NewPrependable(header.IPv6MinimumSize + naSize)
 				pkt := header.ICMPv6(hdr.Prepend(naSize))
@@ -621,11 +622,9 @@ func TestDADFail(t *testing.T) {
 					SrcAddr:       tgt,
 					DstAddr:       header.IPv6AllNodesMulticastAddress,
 				})
-
-				return hdr
-
+				e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{Data: hdr.View().ToVectorisedView()}))
 			},
-			func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+			getStat: func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
 				return s.NeighborAdvert
 			},
 		},
@@ -636,16 +635,16 @@ func TestDADFail(t *testing.T) {
 			ndpDisp := ndpDispatcher{
 				dadC: make(chan ndpDADEvent, 1),
 			}
-			ndpConfigs := stack.DefaultNDPConfigurations()
-			opts := stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				NDPConfigs:       ndpConfigs,
-				NDPDisp:          &ndpDisp,
-			}
-			opts.NDPConfigs.RetransmitTimer = time.Second * 2
+			ndpConfigs := ipv6.DefaultNDPConfigurations()
+			ndpConfigs.RetransmitTimer = time.Second * 2
 
 			e := channel.New(0, 1280, linkAddr1)
-			s := stack.New(opts)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					NDPDisp:    &ndpDisp,
+					NDPConfigs: ndpConfigs,
+				})},
+			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
@@ -664,13 +663,8 @@ func TestDADFail(t *testing.T) {
 				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 			}
 
-			// Receive a packet to simulate multiple nodes owning or
-			// attempting to own the same address.
-			hdr := test.makeBuf(addr1)
-			pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-				Data: hdr.View().ToVectorisedView(),
-			})
-			e.InjectInbound(header.IPv6ProtocolNumber, pkt)
+			// Receive a packet to simulate an address conflict.
+			test.rxPkt(e, addr1)
 
 			stat := test.getStat(s.Stats().ICMP.V6PacketsReceived)
 			if got := stat.Value(); got != 1 {
@@ -754,18 +748,19 @@ func TestDADStop(t *testing.T) {
 			ndpDisp := ndpDispatcher{
 				dadC: make(chan ndpDADEvent, 1),
 			}
-			ndpConfigs := stack.NDPConfigurations{
+
+			ndpConfigs := ipv6.NDPConfigurations{
 				RetransmitTimer:        time.Second,
 				DupAddrDetectTransmits: 2,
 			}
-			opts := stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				NDPDisp:          &ndpDisp,
-				NDPConfigs:       ndpConfigs,
-			}
 
 			e := channel.New(0, 1280, linkAddr1)
-			s := stack.New(opts)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					NDPDisp:    &ndpDisp,
+					NDPConfigs: ndpConfigs,
+				})},
+			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
 			}
@@ -815,19 +810,6 @@ func TestDADStop(t *testing.T) {
 	}
 }
 
-// TestSetNDPConfigurationFailsForBadNICID tests to make sure we get an error if
-// we attempt to update NDP configurations using an invalid NICID.
-func TestSetNDPConfigurationFailsForBadNICID(t *testing.T) {
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-	})
-
-	// No NIC with ID 1 yet.
-	if got := s.SetNDPConfigurations(1, stack.NDPConfigurations{}); got != tcpip.ErrUnknownNICID {
-		t.Fatalf("got s.SetNDPConfigurations = %v, want = %s", got, tcpip.ErrUnknownNICID)
-	}
-}
-
 // TestSetNDPConfigurations tests that we can update and use per-interface NDP
 // configurations without affecting the default NDP configurations or other
 // interfaces' configurations.
@@ -863,8 +845,9 @@ func TestSetNDPConfigurations(t *testing.T) {
 			}
 			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				NDPDisp:          &ndpDisp,
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					NDPDisp: &ndpDisp,
+				})},
 			})
 
 			expectDADEvent := func(nicID tcpip.NICID, addr tcpip.Address) {
@@ -892,12 +875,15 @@ func TestSetNDPConfigurations(t *testing.T) {
 			}
 
 			// Update the NDP configurations on NIC(1) to use DAD.
-			configs := stack.NDPConfigurations{
+			configs := ipv6.NDPConfigurations{
 				DupAddrDetectTransmits: test.dupAddrDetectTransmits,
 				RetransmitTimer:        test.retransmitTimer,
 			}
-			if err := s.SetNDPConfigurations(nicID1, configs); err != nil {
-				t.Fatalf("got SetNDPConfigurations(%d, _) = %s", nicID1, err)
+			if ipv6Ep, err := s.GetNetworkEndpoint(nicID1, header.IPv6ProtocolNumber); err != nil {
+				t.Fatalf("s.GetNetworkEndpoint(%d, %d): %s", nicID1, header.IPv6ProtocolNumber, err)
+			} else {
+				ndpEP := ipv6Ep.(ipv6.NDPEndpoint)
+				ndpEP.SetNDPConfigurations(configs)
 			}
 
 			// Created after updating NIC(1)'s NDP configurations
@@ -1113,14 +1099,15 @@ func TestNoRouterDiscovery(t *testing.T) {
 			}
 			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				NDPConfigs: stack.NDPConfigurations{
-					HandleRAs:              handle,
-					DiscoverDefaultRouters: discover,
-				},
-				NDPDisp: &ndpDisp,
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					NDPConfigs: ipv6.NDPConfigurations{
+						HandleRAs:              handle,
+						DiscoverDefaultRouters: discover,
+					},
+					NDPDisp: &ndpDisp,
+				})},
 			})
-			s.SetForwarding(forwarding)
+			s.SetForwarding(ipv6.ProtocolNumber, forwarding)
 
 			if err := s.CreateNIC(1, e); err != nil {
 				t.Fatalf("CreateNIC(1) = %s", err)
@@ -1151,12 +1138,13 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			DiscoverDefaultRouters: true,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs:              true,
+				DiscoverDefaultRouters: true,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	})
 
 	if err := s.CreateNIC(1, e); err != nil {
@@ -1192,12 +1180,13 @@ func TestRouterDiscovery(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			DiscoverDefaultRouters: true,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs:              true,
+				DiscoverDefaultRouters: true,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	})
 
 	expectRouterEvent := func(addr tcpip.Address, discovered bool) {
@@ -1285,7 +1274,7 @@ func TestRouterDiscovery(t *testing.T) {
 }
 
 // TestRouterDiscoveryMaxRouters tests that only
-// stack.MaxDiscoveredDefaultRouters discovered routers are remembered.
+// ipv6.MaxDiscoveredDefaultRouters discovered routers are remembered.
 func TestRouterDiscoveryMaxRouters(t *testing.T) {
 	ndpDisp := ndpDispatcher{
 		routerC:        make(chan ndpRouterEvent, 1),
@@ -1293,12 +1282,13 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			DiscoverDefaultRouters: true,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs:              true,
+				DiscoverDefaultRouters: true,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	})
 
 	if err := s.CreateNIC(1, e); err != nil {
@@ -1306,14 +1296,14 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
 	}
 
 	// Receive an RA from 2 more than the max number of discovered routers.
-	for i := 1; i <= stack.MaxDiscoveredDefaultRouters+2; i++ {
+	for i := 1; i <= ipv6.MaxDiscoveredDefaultRouters+2; i++ {
 		linkAddr := []byte{2, 2, 3, 4, 5, 0}
 		linkAddr[5] = byte(i)
 		llAddr := header.LinkLocalAddr(tcpip.LinkAddress(linkAddr))
 
 		e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr, 5))
 
-		if i <= stack.MaxDiscoveredDefaultRouters {
+		if i <= ipv6.MaxDiscoveredDefaultRouters {
 			select {
 			case e := <-ndpDisp.routerC:
 				if diff := checkRouterEvent(e, llAddr, true); diff != "" {
@@ -1358,14 +1348,15 @@ func TestNoPrefixDiscovery(t *testing.T) {
 			}
 			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				NDPConfigs: stack.NDPConfigurations{
-					HandleRAs:              handle,
-					DiscoverOnLinkPrefixes: discover,
-				},
-				NDPDisp: &ndpDisp,
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					NDPConfigs: ipv6.NDPConfigurations{
+						HandleRAs:              handle,
+						DiscoverOnLinkPrefixes: discover,
+					},
+					NDPDisp: &ndpDisp,
+				})},
 			})
-			s.SetForwarding(forwarding)
+			s.SetForwarding(ipv6.ProtocolNumber, forwarding)
 
 			if err := s.CreateNIC(1, e); err != nil {
 				t.Fatalf("CreateNIC(1) = %s", err)
@@ -1399,13 +1390,14 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			DiscoverDefaultRouters: false,
-			DiscoverOnLinkPrefixes: true,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs:              true,
+				DiscoverDefaultRouters: false,
+				DiscoverOnLinkPrefixes: true,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	})
 
 	if err := s.CreateNIC(1, e); err != nil {
@@ -1445,12 +1437,13 @@ func TestPrefixDiscovery(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			DiscoverOnLinkPrefixes: true,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs:              true,
+				DiscoverOnLinkPrefixes: true,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	})
 
 	if err := s.CreateNIC(1, e); err != nil {
@@ -1545,12 +1538,13 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			DiscoverOnLinkPrefixes: true,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs:              true,
+				DiscoverOnLinkPrefixes: true,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	})
 
 	if err := s.CreateNIC(1, e); err != nil {
@@ -1621,33 +1615,34 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 }
 
 // TestPrefixDiscoveryMaxRouters tests that only
-// stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are remembered.
+// ipv6.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are remembered.
 func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
 	ndpDisp := ndpDispatcher{
-		prefixC:        make(chan ndpPrefixEvent, stack.MaxDiscoveredOnLinkPrefixes+3),
+		prefixC:        make(chan ndpPrefixEvent, ipv6.MaxDiscoveredOnLinkPrefixes+3),
 		rememberPrefix: true,
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			DiscoverDefaultRouters: false,
-			DiscoverOnLinkPrefixes: true,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs:              true,
+				DiscoverDefaultRouters: false,
+				DiscoverOnLinkPrefixes: true,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	})
 
 	if err := s.CreateNIC(1, e); err != nil {
 		t.Fatalf("CreateNIC(1) = %s", err)
 	}
 
-	optSer := make(header.NDPOptionsSerializer, stack.MaxDiscoveredOnLinkPrefixes+2)
-	prefixes := [stack.MaxDiscoveredOnLinkPrefixes + 2]tcpip.Subnet{}
+	optSer := make(header.NDPOptionsSerializer, ipv6.MaxDiscoveredOnLinkPrefixes+2)
+	prefixes := [ipv6.MaxDiscoveredOnLinkPrefixes + 2]tcpip.Subnet{}
 
 	// Receive an RA with 2 more than the max number of discovered on-link
 	// prefixes.
-	for i := 0; i < stack.MaxDiscoveredOnLinkPrefixes+2; i++ {
+	for i := 0; i < ipv6.MaxDiscoveredOnLinkPrefixes+2; i++ {
 		prefixAddr := [16]byte{1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0}
 		prefixAddr[7] = byte(i)
 		prefix := tcpip.AddressWithPrefix{
@@ -1665,8 +1660,8 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
 	}
 
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, optSer))
-	for i := 0; i < stack.MaxDiscoveredOnLinkPrefixes+2; i++ {
-		if i < stack.MaxDiscoveredOnLinkPrefixes {
+	for i := 0; i < ipv6.MaxDiscoveredOnLinkPrefixes+2; i++ {
+		if i < ipv6.MaxDiscoveredOnLinkPrefixes {
 			select {
 			case e := <-ndpDisp.prefixC:
 				if diff := checkPrefixEvent(e, prefixes[i], true); diff != "" {
@@ -1716,14 +1711,15 @@ func TestNoAutoGenAddr(t *testing.T) {
 			}
 			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				NDPConfigs: stack.NDPConfigurations{
-					HandleRAs:              handle,
-					AutoGenGlobalAddresses: autogen,
-				},
-				NDPDisp: &ndpDisp,
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					NDPConfigs: ipv6.NDPConfigurations{
+						HandleRAs:              handle,
+						AutoGenGlobalAddresses: autogen,
+					},
+					NDPDisp: &ndpDisp,
+				})},
 			})
-			s.SetForwarding(forwarding)
+			s.SetForwarding(ipv6.ProtocolNumber, forwarding)
 
 			if err := s.CreateNIC(1, e); err != nil {
 				t.Fatalf("CreateNIC(1) = %s", err)
@@ -1749,14 +1745,14 @@ func checkAutoGenAddrEvent(e ndpAutoGenAddrEvent, addr tcpip.AddressWithPrefix,
 
 // TestAutoGenAddr tests that an address is properly generated and invalidated
 // when configured to do so.
-func TestAutoGenAddr(t *testing.T) {
+func TestAutoGenAddr2(t *testing.T) {
 	const newMinVL = 2
 	newMinVLDuration := newMinVL * time.Second
-	saved := stack.MinPrefixInformationValidLifetimeForUpdate
+	saved := ipv6.MinPrefixInformationValidLifetimeForUpdate
 	defer func() {
-		stack.MinPrefixInformationValidLifetimeForUpdate = saved
+		ipv6.MinPrefixInformationValidLifetimeForUpdate = saved
 	}()
-	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
+	ipv6.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
 
 	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
 	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
@@ -1766,12 +1762,13 @@ func TestAutoGenAddr(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			AutoGenGlobalAddresses: true,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs:              true,
+				AutoGenGlobalAddresses: true,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	})
 
 	if err := s.CreateNIC(1, e); err != nil {
@@ -1876,14 +1873,14 @@ func TestAutoGenTempAddr(t *testing.T) {
 		newMinVLDuration = newMinVL * time.Second
 	)
 
-	savedMinPrefixInformationValidLifetimeForUpdate := stack.MinPrefixInformationValidLifetimeForUpdate
-	savedMaxDesync := stack.MaxDesyncFactor
+	savedMinPrefixInformationValidLifetimeForUpdate := ipv6.MinPrefixInformationValidLifetimeForUpdate
+	savedMaxDesync := ipv6.MaxDesyncFactor
 	defer func() {
-		stack.MinPrefixInformationValidLifetimeForUpdate = savedMinPrefixInformationValidLifetimeForUpdate
-		stack.MaxDesyncFactor = savedMaxDesync
+		ipv6.MinPrefixInformationValidLifetimeForUpdate = savedMinPrefixInformationValidLifetimeForUpdate
+		ipv6.MaxDesyncFactor = savedMaxDesync
 	}()
-	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
-	stack.MaxDesyncFactor = time.Nanosecond
+	ipv6.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
+	ipv6.MaxDesyncFactor = time.Nanosecond
 
 	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
 	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
@@ -1931,16 +1928,17 @@ func TestAutoGenTempAddr(t *testing.T) {
 				}
 				e := channel.New(0, 1280, linkAddr1)
 				s := stack.New(stack.Options{
-					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-					NDPConfigs: stack.NDPConfigurations{
-						DupAddrDetectTransmits:     test.dupAddrTransmits,
-						RetransmitTimer:            test.retransmitTimer,
-						HandleRAs:                  true,
-						AutoGenGlobalAddresses:     true,
-						AutoGenTempGlobalAddresses: true,
-					},
-					NDPDisp:     &ndpDisp,
-					TempIIDSeed: seed,
+					NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+						NDPConfigs: ipv6.NDPConfigurations{
+							DupAddrDetectTransmits:     test.dupAddrTransmits,
+							RetransmitTimer:            test.retransmitTimer,
+							HandleRAs:                  true,
+							AutoGenGlobalAddresses:     true,
+							AutoGenTempGlobalAddresses: true,
+						},
+						NDPDisp:     &ndpDisp,
+						TempIIDSeed: seed,
+					})},
 				})
 
 				if err := s.CreateNIC(nicID, e); err != nil {
@@ -2119,11 +2117,11 @@ func TestAutoGenTempAddr(t *testing.T) {
 func TestNoAutoGenTempAddrForLinkLocal(t *testing.T) {
 	const nicID = 1
 
-	savedMaxDesyncFactor := stack.MaxDesyncFactor
+	savedMaxDesyncFactor := ipv6.MaxDesyncFactor
 	defer func() {
-		stack.MaxDesyncFactor = savedMaxDesyncFactor
+		ipv6.MaxDesyncFactor = savedMaxDesyncFactor
 	}()
-	stack.MaxDesyncFactor = time.Nanosecond
+	ipv6.MaxDesyncFactor = time.Nanosecond
 
 	tests := []struct {
 		name             string
@@ -2160,12 +2158,13 @@ func TestNoAutoGenTempAddrForLinkLocal(t *testing.T) {
 				}
 				e := channel.New(0, 1280, linkAddr1)
 				s := stack.New(stack.Options{
-					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-					NDPConfigs: stack.NDPConfigurations{
-						AutoGenTempGlobalAddresses: true,
-					},
-					NDPDisp:              &ndpDisp,
-					AutoGenIPv6LinkLocal: true,
+					NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+						NDPConfigs: ipv6.NDPConfigurations{
+							AutoGenTempGlobalAddresses: true,
+						},
+						NDPDisp:              &ndpDisp,
+						AutoGenIPv6LinkLocal: true,
+					})},
 				})
 
 				if err := s.CreateNIC(nicID, e); err != nil {
@@ -2211,11 +2210,11 @@ func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) {
 		retransmitTimer = 2 * time.Second
 	)
 
-	savedMaxDesyncFactor := stack.MaxDesyncFactor
+	savedMaxDesyncFactor := ipv6.MaxDesyncFactor
 	defer func() {
-		stack.MaxDesyncFactor = savedMaxDesyncFactor
+		ipv6.MaxDesyncFactor = savedMaxDesyncFactor
 	}()
-	stack.MaxDesyncFactor = 0
+	ipv6.MaxDesyncFactor = 0
 
 	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
 	var tempIIDHistory [header.IIDSize]byte
@@ -2228,15 +2227,16 @@ func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			DupAddrDetectTransmits:     dadTransmits,
-			RetransmitTimer:            retransmitTimer,
-			HandleRAs:                  true,
-			AutoGenGlobalAddresses:     true,
-			AutoGenTempGlobalAddresses: true,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				DupAddrDetectTransmits:     dadTransmits,
+				RetransmitTimer:            retransmitTimer,
+				HandleRAs:                  true,
+				AutoGenGlobalAddresses:     true,
+				AutoGenTempGlobalAddresses: true,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	})
 
 	if err := s.CreateNIC(nicID, e); err != nil {
@@ -2294,17 +2294,17 @@ func TestAutoGenTempAddrRegen(t *testing.T) {
 		newMinVLDuration = newMinVL * time.Second
 	)
 
-	savedMaxDesyncFactor := stack.MaxDesyncFactor
-	savedMinMaxTempAddrPreferredLifetime := stack.MinMaxTempAddrPreferredLifetime
-	savedMinMaxTempAddrValidLifetime := stack.MinMaxTempAddrValidLifetime
+	savedMaxDesyncFactor := ipv6.MaxDesyncFactor
+	savedMinMaxTempAddrPreferredLifetime := ipv6.MinMaxTempAddrPreferredLifetime
+	savedMinMaxTempAddrValidLifetime := ipv6.MinMaxTempAddrValidLifetime
 	defer func() {
-		stack.MaxDesyncFactor = savedMaxDesyncFactor
-		stack.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime
-		stack.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime
+		ipv6.MaxDesyncFactor = savedMaxDesyncFactor
+		ipv6.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime
+		ipv6.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime
 	}()
-	stack.MaxDesyncFactor = 0
-	stack.MinMaxTempAddrPreferredLifetime = newMinVLDuration
-	stack.MinMaxTempAddrValidLifetime = newMinVLDuration
+	ipv6.MaxDesyncFactor = 0
+	ipv6.MinMaxTempAddrPreferredLifetime = newMinVLDuration
+	ipv6.MinMaxTempAddrValidLifetime = newMinVLDuration
 
 	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
 	var tempIIDHistory [header.IIDSize]byte
@@ -2317,16 +2317,17 @@ func TestAutoGenTempAddrRegen(t *testing.T) {
 		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
 	}
 	e := channel.New(0, 1280, linkAddr1)
-	ndpConfigs := stack.NDPConfigurations{
+	ndpConfigs := ipv6.NDPConfigurations{
 		HandleRAs:                  true,
 		AutoGenGlobalAddresses:     true,
 		AutoGenTempGlobalAddresses: true,
 		RegenAdvanceDuration:       newMinVLDuration - regenAfter,
 	}
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs:       ndpConfigs,
-		NDPDisp:          &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ndpConfigs,
+			NDPDisp:    &ndpDisp,
+		})},
 	})
 
 	if err := s.CreateNIC(nicID, e); err != nil {
@@ -2382,8 +2383,11 @@ func TestAutoGenTempAddrRegen(t *testing.T) {
 
 	// Stop generating temporary addresses
 	ndpConfigs.AutoGenTempGlobalAddresses = false
-	if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil {
-		t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err)
+	if ipv6Ep, err := s.GetNetworkEndpoint(nicID, header.IPv6ProtocolNumber); err != nil {
+		t.Fatalf("s.GetNetworkEndpoint(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+	} else {
+		ndpEP := ipv6Ep.(ipv6.NDPEndpoint)
+		ndpEP.SetNDPConfigurations(ndpConfigs)
 	}
 
 	// Wait for all the temporary addresses to get invalidated.
@@ -2439,17 +2443,17 @@ func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) {
 		newMinVLDuration = newMinVL * time.Second
 	)
 
-	savedMaxDesyncFactor := stack.MaxDesyncFactor
-	savedMinMaxTempAddrPreferredLifetime := stack.MinMaxTempAddrPreferredLifetime
-	savedMinMaxTempAddrValidLifetime := stack.MinMaxTempAddrValidLifetime
+	savedMaxDesyncFactor := ipv6.MaxDesyncFactor
+	savedMinMaxTempAddrPreferredLifetime := ipv6.MinMaxTempAddrPreferredLifetime
+	savedMinMaxTempAddrValidLifetime := ipv6.MinMaxTempAddrValidLifetime
 	defer func() {
-		stack.MaxDesyncFactor = savedMaxDesyncFactor
-		stack.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime
-		stack.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime
+		ipv6.MaxDesyncFactor = savedMaxDesyncFactor
+		ipv6.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime
+		ipv6.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime
 	}()
-	stack.MaxDesyncFactor = 0
-	stack.MinMaxTempAddrPreferredLifetime = newMinVLDuration
-	stack.MinMaxTempAddrValidLifetime = newMinVLDuration
+	ipv6.MaxDesyncFactor = 0
+	ipv6.MinMaxTempAddrPreferredLifetime = newMinVLDuration
+	ipv6.MinMaxTempAddrValidLifetime = newMinVLDuration
 
 	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
 	var tempIIDHistory [header.IIDSize]byte
@@ -2462,16 +2466,17 @@ func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) {
 		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
 	}
 	e := channel.New(0, 1280, linkAddr1)
-	ndpConfigs := stack.NDPConfigurations{
+	ndpConfigs := ipv6.NDPConfigurations{
 		HandleRAs:                  true,
 		AutoGenGlobalAddresses:     true,
 		AutoGenTempGlobalAddresses: true,
 		RegenAdvanceDuration:       newMinVLDuration - regenAfter,
 	}
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs:       ndpConfigs,
-		NDPDisp:          &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ndpConfigs,
+			NDPDisp:    &ndpDisp,
+		})},
 	})
 
 	if err := s.CreateNIC(nicID, e); err != nil {
@@ -2545,9 +2550,12 @@ func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) {
 	// as paased.
 	ndpConfigs.MaxTempAddrValidLifetime = 100 * time.Second
 	ndpConfigs.MaxTempAddrPreferredLifetime = 100 * time.Second
-	if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil {
-		t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err)
+	ipv6Ep, err := s.GetNetworkEndpoint(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("s.GetNetworkEndpoint(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
 	}
+	ndpEP := ipv6Ep.(ipv6.NDPEndpoint)
+	ndpEP.SetNDPConfigurations(ndpConfigs)
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
 	select {
 	case e := <-ndpDisp.autoGenAddrC:
@@ -2565,9 +2573,7 @@ func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) {
 	newLifetimes := newMinVLDuration + regenAfter + defaultAsyncNegativeEventTimeout
 	ndpConfigs.MaxTempAddrValidLifetime = newLifetimes
 	ndpConfigs.MaxTempAddrPreferredLifetime = newLifetimes
-	if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil {
-		t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err)
-	}
+	ndpEP.SetNDPConfigurations(ndpConfigs)
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
 	expectAutoGenAddrEventAsync(tempAddr3, newAddr, regenAfter+defaultAsyncPositiveEventTimeout)
 }
@@ -2655,20 +2661,21 @@ func TestMixedSLAACAddrConflictRegen(t *testing.T) {
 				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
 			}
 			e := channel.New(0, 1280, linkAddr1)
-			ndpConfigs := stack.NDPConfigurations{
+			ndpConfigs := ipv6.NDPConfigurations{
 				HandleRAs:                     true,
 				AutoGenGlobalAddresses:        true,
 				AutoGenTempGlobalAddresses:    test.tempAddrs,
 				AutoGenAddressConflictRetries: 1,
 			}
 			s := stack.New(stack.Options{
-				NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
-				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
-				NDPConfigs:         ndpConfigs,
-				NDPDisp:            &ndpDisp,
-				OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
-					NICNameFromID: test.nicNameFromID,
-				},
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					NDPConfigs: ndpConfigs,
+					NDPDisp:    &ndpDisp,
+					OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{
+						NICNameFromID: test.nicNameFromID,
+					},
+				})},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
 			})
 
 			s.SetRouteTable([]tcpip.Route{{
@@ -2739,8 +2746,11 @@ func TestMixedSLAACAddrConflictRegen(t *testing.T) {
 			ndpDisp.dadC = make(chan ndpDADEvent, 2)
 			ndpConfigs.DupAddrDetectTransmits = dupAddrTransmits
 			ndpConfigs.RetransmitTimer = retransmitTimer
-			if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil {
-				t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err)
+			if ipv6Ep, err := s.GetNetworkEndpoint(nicID, header.IPv6ProtocolNumber); err != nil {
+				t.Fatalf("s.GetNetworkEndpoint(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+			} else {
+				ndpEP := ipv6Ep.(ipv6.NDPEndpoint)
+				ndpEP.SetNDPConfigurations(ndpConfigs)
 			}
 
 			// Do SLAAC for prefix.
@@ -2754,9 +2764,7 @@ func TestMixedSLAACAddrConflictRegen(t *testing.T) {
 			// DAD failure to restart the local generation process.
 			addr := test.addrs[maxSLAACAddrLocalRegenAttempts-1]
 			expectAutoGenAddrAsyncEvent(addr, newAddr)
-			if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
-				t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
-			}
+			rxNDPSolicit(e, addr.Address)
 			select {
 			case e := <-ndpDisp.dadC:
 				if diff := checkDADEvent(e, nicID, addr.Address, false, nil); diff != "" {
@@ -2787,20 +2795,22 @@ func TestMixedSLAACAddrConflictRegen(t *testing.T) {
 // stack.Stack will have a default route through the router (llAddr3) installed
 // and a static link-address (linkAddr3) added to the link address cache for the
 // router.
-func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*ndpDispatcher, *channel.Endpoint, *stack.Stack) {
+func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID, useNeighborCache bool) (*ndpDispatcher, *channel.Endpoint, *stack.Stack) {
 	t.Helper()
 	ndpDisp := &ndpDispatcher{
 		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			AutoGenGlobalAddresses: true,
-		},
-		NDPDisp: ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs:              true,
+				AutoGenGlobalAddresses: true,
+			},
+			NDPDisp: ndpDisp,
+		})},
+		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
+		UseNeighborCache:   useNeighborCache,
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -2810,7 +2820,11 @@ func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*nd
 		Gateway:     llAddr3,
 		NIC:         nicID,
 	}})
-	s.AddLinkAddress(nicID, llAddr3, linkAddr3)
+	if useNeighborCache {
+		s.AddStaticNeighbor(nicID, llAddr3, linkAddr3)
+	} else {
+		s.AddLinkAddress(nicID, llAddr3, linkAddr3)
+	}
 	return ndpDisp, e, s
 }
 
@@ -2884,110 +2898,128 @@ func addrForNewConnectionWithAddr(t *testing.T, s *stack.Stack, addr tcpip.FullA
 // TestAutoGenAddrDeprecateFromPI tests deprecating a SLAAC address when
 // receiving a PI with 0 preferred lifetime.
 func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
-	const nicID = 1
+	stacks := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
+	}
 
-	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
-	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+	for _, stackTyp := range stacks {
+		t.Run(stackTyp.name, func(t *testing.T) {
+			const nicID = 1
 
-	ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
+			prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+			prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
 
-	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
-		t.Helper()
+			ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID, stackTyp.useNeighborCache)
 
-		select {
-		case e := <-ndpDisp.autoGenAddrC:
-			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
-				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+				t.Helper()
+
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
 			}
-		default:
-			t.Fatal("expected addr auto gen event")
-		}
-	}
 
-	expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
-		t.Helper()
+			expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+				t.Helper()
 
-		if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
-			t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
-		} else if got != addr {
-			t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
-		}
+				if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+					t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+				} else if got != addr {
+					t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+				}
 
-		if got := addrForNewConnection(t, s); got != addr.Address {
-			t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
-		}
-	}
+				if got := addrForNewConnection(t, s); got != addr.Address {
+					t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+				}
+			}
 
-	// Receive PI for prefix1.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
-	expectAutoGenAddrEvent(addr1, newAddr)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should have %s in the list of addresses", addr1)
-	}
-	expectPrimaryAddr(addr1)
+			// Receive PI for prefix1.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+			expectAutoGenAddrEvent(addr1, newAddr)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should have %s in the list of addresses", addr1)
+			}
+			expectPrimaryAddr(addr1)
 
-	// Deprecate addr for prefix1 immedaitely.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
-	expectAutoGenAddrEvent(addr1, deprecatedAddr)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should have %s in the list of addresses", addr1)
-	}
-	// addr should still be the primary endpoint as there are no other addresses.
-	expectPrimaryAddr(addr1)
+			// Deprecate addr for prefix1 immedaitely.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
+			expectAutoGenAddrEvent(addr1, deprecatedAddr)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should have %s in the list of addresses", addr1)
+			}
+			// addr should still be the primary endpoint as there are no other addresses.
+			expectPrimaryAddr(addr1)
 
-	// Refresh lifetimes of addr generated from prefix1.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
-	}
-	expectPrimaryAddr(addr1)
+			// Refresh lifetimes of addr generated from prefix1.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
+			expectPrimaryAddr(addr1)
 
-	// Receive PI for prefix2.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
-	expectAutoGenAddrEvent(addr2, newAddr)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	expectPrimaryAddr(addr2)
+			// Receive PI for prefix2.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+			expectAutoGenAddrEvent(addr2, newAddr)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			expectPrimaryAddr(addr2)
 
-	// Deprecate addr for prefix2 immedaitely.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
-	expectAutoGenAddrEvent(addr2, deprecatedAddr)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	// addr1 should be the primary endpoint now since addr2 is deprecated but
-	// addr1 is not.
-	expectPrimaryAddr(addr1)
-	// addr2 is deprecated but if explicitly requested, it should be used.
-	fullAddr2 := tcpip.FullAddress{Addr: addr2.Address, NIC: nicID}
-	if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
-	}
+			// Deprecate addr for prefix2 immedaitely.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+			expectAutoGenAddrEvent(addr2, deprecatedAddr)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			// addr1 should be the primary endpoint now since addr2 is deprecated but
+			// addr1 is not.
+			expectPrimaryAddr(addr1)
+			// addr2 is deprecated but if explicitly requested, it should be used.
+			fullAddr2 := tcpip.FullAddress{Addr: addr2.Address, NIC: nicID}
+			if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
+				t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
+			}
 
-	// Another PI w/ 0 preferred lifetime should not result in a deprecation
-	// event.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
-	}
-	expectPrimaryAddr(addr1)
-	if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
-	}
+			// Another PI w/ 0 preferred lifetime should not result in a deprecation
+			// event.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
+			expectPrimaryAddr(addr1)
+			if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
+				t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
+			}
 
-	// Refresh lifetimes of addr generated from prefix2.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
+			// Refresh lifetimes of addr generated from prefix2.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
+			expectPrimaryAddr(addr2)
+		})
 	}
-	expectPrimaryAddr(addr2)
 }
 
 // TestAutoGenAddrJobDeprecation tests that an address is properly deprecated
@@ -2996,217 +3028,236 @@ func TestAutoGenAddrJobDeprecation(t *testing.T) {
 	const nicID = 1
 	const newMinVL = 2
 	newMinVLDuration := newMinVL * time.Second
-	saved := stack.MinPrefixInformationValidLifetimeForUpdate
-	defer func() {
-		stack.MinPrefixInformationValidLifetimeForUpdate = saved
-	}()
-	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
 
-	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
-	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+	stacks := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
+	}
 
-	ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
+	for _, stackTyp := range stacks {
+		t.Run(stackTyp.name, func(t *testing.T) {
+			saved := ipv6.MinPrefixInformationValidLifetimeForUpdate
+			defer func() {
+				ipv6.MinPrefixInformationValidLifetimeForUpdate = saved
+			}()
+			ipv6.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
 
-	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
-		t.Helper()
+			prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+			prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
 
-		select {
-		case e := <-ndpDisp.autoGenAddrC:
-			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
-				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID, stackTyp.useNeighborCache)
+
+			expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+				t.Helper()
+
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
 			}
-		default:
-			t.Fatal("expected addr auto gen event")
-		}
-	}
 
-	expectAutoGenAddrEventAfter := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
-		t.Helper()
+			expectAutoGenAddrEventAfter := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
+				t.Helper()
 
-		select {
-		case e := <-ndpDisp.autoGenAddrC:
-			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
-				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				case <-time.After(timeout):
+					t.Fatal("timed out waiting for addr auto gen event")
+				}
 			}
-		case <-time.After(timeout):
-			t.Fatal("timed out waiting for addr auto gen event")
-		}
-	}
 
-	expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
-		t.Helper()
+			expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+				t.Helper()
 
-		if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
-			t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
-		} else if got != addr {
-			t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
-		}
+				if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+					t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+				} else if got != addr {
+					t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+				}
 
-		if got := addrForNewConnection(t, s); got != addr.Address {
-			t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
-		}
-	}
+				if got := addrForNewConnection(t, s); got != addr.Address {
+					t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+				}
+			}
 
-	// Receive PI for prefix2.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
-	expectAutoGenAddrEvent(addr2, newAddr)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	expectPrimaryAddr(addr2)
+			// Receive PI for prefix2.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+			expectAutoGenAddrEvent(addr2, newAddr)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			expectPrimaryAddr(addr2)
 
-	// Receive a PI for prefix1.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 90))
-	expectAutoGenAddrEvent(addr1, newAddr)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should have %s in the list of addresses", addr1)
-	}
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	expectPrimaryAddr(addr1)
+			// Receive a PI for prefix1.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 90))
+			expectAutoGenAddrEvent(addr1, newAddr)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should have %s in the list of addresses", addr1)
+			}
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			expectPrimaryAddr(addr1)
 
-	// Refresh lifetime for addr of prefix1.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
-	}
-	expectPrimaryAddr(addr1)
+			// Refresh lifetime for addr of prefix1.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
+			expectPrimaryAddr(addr1)
 
-	// Wait for addr of prefix1 to be deprecated.
-	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should not have %s in the list of addresses", addr1)
-	}
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	// addr2 should be the primary endpoint now since addr1 is deprecated but
-	// addr2 is not.
-	expectPrimaryAddr(addr2)
-	// addr1 is deprecated but if explicitly requested, it should be used.
-	fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID}
-	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
-	}
+			// Wait for addr of prefix1 to be deprecated.
+			expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should not have %s in the list of addresses", addr1)
+			}
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			// addr2 should be the primary endpoint now since addr1 is deprecated but
+			// addr2 is not.
+			expectPrimaryAddr(addr2)
+			// addr1 is deprecated but if explicitly requested, it should be used.
+			fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID}
+			if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+				t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
+			}
 
-	// Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make
-	// sure we do not get a deprecation event again.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
-	}
-	expectPrimaryAddr(addr2)
-	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
-	}
+			// Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make
+			// sure we do not get a deprecation event again.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
+			expectPrimaryAddr(addr2)
+			if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+				t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
+			}
 
-	// Refresh lifetimes for addr of prefix1.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
-	}
-	// addr1 is the primary endpoint again since it is non-deprecated now.
-	expectPrimaryAddr(addr1)
+			// Refresh lifetimes for addr of prefix1.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
+			// addr1 is the primary endpoint again since it is non-deprecated now.
+			expectPrimaryAddr(addr1)
 
-	// Wait for addr of prefix1 to be deprecated.
-	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout)
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should not have %s in the list of addresses", addr1)
-	}
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	// addr2 should be the primary endpoint now since it is not deprecated.
-	expectPrimaryAddr(addr2)
-	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
-	}
+			// Wait for addr of prefix1 to be deprecated.
+			expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout)
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should not have %s in the list of addresses", addr1)
+			}
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			// addr2 should be the primary endpoint now since it is not deprecated.
+			expectPrimaryAddr(addr2)
+			if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+				t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
+			}
 
-	// Wait for addr of prefix1 to be invalidated.
-	expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncPositiveEventTimeout)
-	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should not have %s in the list of addresses", addr1)
-	}
-	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should have %s in the list of addresses", addr2)
-	}
-	expectPrimaryAddr(addr2)
+			// Wait for addr of prefix1 to be invalidated.
+			expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncPositiveEventTimeout)
+			if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should not have %s in the list of addresses", addr1)
+			}
+			if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should have %s in the list of addresses", addr2)
+			}
+			expectPrimaryAddr(addr2)
 
-	// Refresh both lifetimes for addr of prefix2 to the same value.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, newMinVL, newMinVL))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto-generated event")
-	default:
-	}
+			// Refresh both lifetimes for addr of prefix2 to the same value.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, newMinVL, newMinVL))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			default:
+			}
 
-	// Wait for a deprecation then invalidation events, or just an invalidation
-	// event. We need to cover both cases but cannot deterministically hit both
-	// cases because the deprecation and invalidation handlers could be handled in
-	// either deprecation then invalidation, or invalidation then deprecation
-	// (which should be cancelled by the invalidation handler).
-	select {
-	case e := <-ndpDisp.autoGenAddrC:
-		if diff := checkAutoGenAddrEvent(e, addr2, deprecatedAddr); diff == "" {
-			// If we get a deprecation event first, we should get an invalidation
-			// event almost immediately after.
+			// Wait for a deprecation then invalidation events, or just an invalidation
+			// event. We need to cover both cases but cannot deterministically hit both
+			// cases because the deprecation and invalidation handlers could be handled in
+			// either deprecation then invalidation, or invalidation then deprecation
+			// (which should be cancelled by the invalidation handler).
 			select {
 			case e := <-ndpDisp.autoGenAddrC:
-				if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff != "" {
-					t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+				if diff := checkAutoGenAddrEvent(e, addr2, deprecatedAddr); diff == "" {
+					// If we get a deprecation event first, we should get an invalidation
+					// event almost immediately after.
+					select {
+					case e := <-ndpDisp.autoGenAddrC:
+						if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff != "" {
+							t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+						}
+					case <-time.After(defaultAsyncPositiveEventTimeout):
+						t.Fatal("timed out waiting for addr auto gen event")
+					}
+				} else if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff == "" {
+					// If we get an invalidation  event first, we should not get a deprecation
+					// event after.
+					select {
+					case <-ndpDisp.autoGenAddrC:
+						t.Fatal("unexpectedly got an auto-generated event")
+					case <-time.After(defaultAsyncNegativeEventTimeout):
+					}
+				} else {
+					t.Fatalf("got unexpected auto-generated event")
 				}
-			case <-time.After(defaultAsyncPositiveEventTimeout):
+			case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout):
 				t.Fatal("timed out waiting for addr auto gen event")
 			}
-		} else if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff == "" {
-			// If we get an invalidation  event first, we should not get a deprecation
-			// event after.
-			select {
-			case <-ndpDisp.autoGenAddrC:
-				t.Fatal("unexpectedly got an auto-generated event")
-			case <-time.After(defaultAsyncNegativeEventTimeout):
+			if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+				t.Fatalf("should not have %s in the list of addresses", addr1)
+			}
+			if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+				t.Fatalf("should not have %s in the list of addresses", addr2)
+			}
+			// Should not have any primary endpoints.
+			if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+				t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+			} else if want := (tcpip.AddressWithPrefix{}); got != want {
+				t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, want)
+			}
+			wq := waiter.Queue{}
+			we, ch := waiter.NewChannelEntry(nil)
+			wq.EventRegister(&we, waiter.EventIn)
+			defer wq.EventUnregister(&we)
+			defer close(ch)
+			ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq)
+			if err != nil {
+				t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
+			}
+			defer ep.Close()
+			if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+				t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
 			}
-		} else {
-			t.Fatalf("got unexpected auto-generated event")
-		}
-	case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout):
-		t.Fatal("timed out waiting for addr auto gen event")
-	}
-	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
-		t.Fatalf("should not have %s in the list of addresses", addr1)
-	}
-	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
-		t.Fatalf("should not have %s in the list of addresses", addr2)
-	}
-	// Should not have any primary endpoints.
-	if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
-		t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
-	} else if want := (tcpip.AddressWithPrefix{}); got != want {
-		t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, want)
-	}
-	wq := waiter.Queue{}
-	we, ch := waiter.NewChannelEntry(nil)
-	wq.EventRegister(&we, waiter.EventIn)
-	defer wq.EventUnregister(&we)
-	defer close(ch)
-	ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq)
-	if err != nil {
-		t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
-	}
-	defer ep.Close()
-	if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
-		t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
-	}
 
-	if err := ep.Connect(dstAddr); err != tcpip.ErrNoRoute {
-		t.Errorf("got ep.Connect(%+v) = %v, want = %s", dstAddr, err, tcpip.ErrNoRoute)
+			if err := ep.Connect(dstAddr); err != tcpip.ErrNoRoute {
+				t.Errorf("got ep.Connect(%+v) = %s, want = %s", dstAddr, err, tcpip.ErrNoRoute)
+			}
+		})
 	}
 }
 
@@ -3216,12 +3267,12 @@ func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) {
 	const infiniteVLSeconds = 2
 	const minVLSeconds = 1
 	savedIL := header.NDPInfiniteLifetime
-	savedMinVL := stack.MinPrefixInformationValidLifetimeForUpdate
+	savedMinVL := ipv6.MinPrefixInformationValidLifetimeForUpdate
 	defer func() {
-		stack.MinPrefixInformationValidLifetimeForUpdate = savedMinVL
+		ipv6.MinPrefixInformationValidLifetimeForUpdate = savedMinVL
 		header.NDPInfiniteLifetime = savedIL
 	}()
-	stack.MinPrefixInformationValidLifetimeForUpdate = minVLSeconds * time.Second
+	ipv6.MinPrefixInformationValidLifetimeForUpdate = minVLSeconds * time.Second
 	header.NDPInfiniteLifetime = infiniteVLSeconds * time.Second
 
 	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
@@ -3265,12 +3316,13 @@ func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) {
 				}
 				e := channel.New(0, 1280, linkAddr1)
 				s := stack.New(stack.Options{
-					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-					NDPConfigs: stack.NDPConfigurations{
-						HandleRAs:              true,
-						AutoGenGlobalAddresses: true,
-					},
-					NDPDisp: &ndpDisp,
+					NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+						NDPConfigs: ipv6.NDPConfigurations{
+							HandleRAs:              true,
+							AutoGenGlobalAddresses: true,
+						},
+						NDPDisp: &ndpDisp,
+					})},
 				})
 
 				if err := s.CreateNIC(1, e); err != nil {
@@ -3315,11 +3367,11 @@ func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) {
 func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
 	const infiniteVL = 4294967295
 	const newMinVL = 4
-	saved := stack.MinPrefixInformationValidLifetimeForUpdate
+	saved := ipv6.MinPrefixInformationValidLifetimeForUpdate
 	defer func() {
-		stack.MinPrefixInformationValidLifetimeForUpdate = saved
+		ipv6.MinPrefixInformationValidLifetimeForUpdate = saved
 	}()
-	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVL * time.Second
+	ipv6.MinPrefixInformationValidLifetimeForUpdate = newMinVL * time.Second
 
 	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
 
@@ -3407,12 +3459,13 @@ func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
 				}
 				e := channel.New(10, 1280, linkAddr1)
 				s := stack.New(stack.Options{
-					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-					NDPConfigs: stack.NDPConfigurations{
-						HandleRAs:              true,
-						AutoGenGlobalAddresses: true,
-					},
-					NDPDisp: &ndpDisp,
+					NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+						NDPConfigs: ipv6.NDPConfigurations{
+							HandleRAs:              true,
+							AutoGenGlobalAddresses: true,
+						},
+						NDPDisp: &ndpDisp,
+					})},
 				})
 
 				if err := s.CreateNIC(1, e); err != nil {
@@ -3473,12 +3526,13 @@ func TestAutoGenAddrRemoval(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			AutoGenGlobalAddresses: true,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs:              true,
+				AutoGenGlobalAddresses: true,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	})
 
 	if err := s.CreateNIC(1, e); err != nil {
@@ -3524,110 +3578,128 @@ func TestAutoGenAddrRemoval(t *testing.T) {
 func TestAutoGenAddrAfterRemoval(t *testing.T) {
 	const nicID = 1
 
-	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
-	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
-	ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
-
-	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
-		t.Helper()
-
-		select {
-		case e := <-ndpDisp.autoGenAddrC:
-			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
-				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
-			}
-		default:
-			t.Fatal("expected addr auto gen event")
-		}
+	stacks := []struct {
+		name             string
+		useNeighborCache bool
+	}{
+		{
+			name:             "linkAddrCache",
+			useNeighborCache: false,
+		},
+		{
+			name:             "neighborCache",
+			useNeighborCache: true,
+		},
 	}
 
-	expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
-		t.Helper()
-
-		if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
-			t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
-		} else if got != addr {
-			t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
-		}
+	for _, stackTyp := range stacks {
+		t.Run(stackTyp.name, func(t *testing.T) {
+			prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+			prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+			ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID, stackTyp.useNeighborCache)
 
-		if got := addrForNewConnection(t, s); got != addr.Address {
-			t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
-		}
-	}
+			expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+				t.Helper()
 
-	// Receive a PI to auto-generate addr1 with a large valid and preferred
-	// lifetime.
-	const largeLifetimeSeconds = 999
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix1, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
-	expectAutoGenAddrEvent(addr1, newAddr)
-	expectPrimaryAddr(addr1)
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
+			}
 
-	// Add addr2 as a static address.
-	protoAddr2 := tcpip.ProtocolAddress{
-		Protocol:          header.IPv6ProtocolNumber,
-		AddressWithPrefix: addr2,
-	}
-	if err := s.AddProtocolAddressWithOptions(nicID, protoAddr2, stack.FirstPrimaryEndpoint); err != nil {
-		t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err)
-	}
-	// addr2 should be more preferred now since it is at the front of the primary
-	// list.
-	expectPrimaryAddr(addr2)
+			expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+				t.Helper()
 
-	// Get a route using addr2 to increment its reference count then remove it
-	// to leave it in the permanentExpired state.
-	r, err := s.FindRoute(nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, false)
-	if err != nil {
-		t.Fatalf("FindRoute(%d, %s, %s, %d, false): %s", nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, err)
-	}
-	defer r.Release()
-	if err := s.RemoveAddress(nicID, addr2.Address); err != nil {
-		t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, addr2.Address, err)
-	}
-	// addr1 should be preferred again since addr2 is in the expired state.
-	expectPrimaryAddr(addr1)
+				if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+					t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+				} else if got != addr {
+					t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+				}
 
-	// Receive a PI to auto-generate addr2 as valid and preferred.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
-	expectAutoGenAddrEvent(addr2, newAddr)
-	// addr2 should be more preferred now that it is closer to the front of the
-	// primary list and not deprecated.
-	expectPrimaryAddr(addr2)
+				if got := addrForNewConnection(t, s); got != addr.Address {
+					t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+				}
+			}
 
-	// Removing the address should result in an invalidation event immediately.
-	// It should still be in the permanentExpired state because r is still held.
-	//
-	// We remove addr2 here to make sure addr2 was marked as a SLAAC address
-	// (it was previously marked as a static address).
-	if err := s.RemoveAddress(1, addr2.Address); err != nil {
-		t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err)
-	}
-	expectAutoGenAddrEvent(addr2, invalidatedAddr)
-	// addr1 should be more preferred since addr2 is in the expired state.
-	expectPrimaryAddr(addr1)
+			// Receive a PI to auto-generate addr1 with a large valid and preferred
+			// lifetime.
+			const largeLifetimeSeconds = 999
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix1, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+			expectAutoGenAddrEvent(addr1, newAddr)
+			expectPrimaryAddr(addr1)
 
-	// Receive a PI to auto-generate addr2 as valid and deprecated.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, 0))
-	expectAutoGenAddrEvent(addr2, newAddr)
-	// addr1 should still be more preferred since addr2 is deprecated, even though
-	// it is closer to the front of the primary list.
-	expectPrimaryAddr(addr1)
+			// Add addr2 as a static address.
+			protoAddr2 := tcpip.ProtocolAddress{
+				Protocol:          header.IPv6ProtocolNumber,
+				AddressWithPrefix: addr2,
+			}
+			if err := s.AddProtocolAddressWithOptions(nicID, protoAddr2, stack.FirstPrimaryEndpoint); err != nil {
+				t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err)
+			}
+			// addr2 should be more preferred now since it is at the front of the primary
+			// list.
+			expectPrimaryAddr(addr2)
 
-	// Receive a PI to refresh addr2's preferred lifetime.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Fatal("unexpectedly got an auto gen addr event")
-	default:
-	}
-	// addr2 should be more preferred now that it is not deprecated.
-	expectPrimaryAddr(addr2)
+			// Get a route using addr2 to increment its reference count then remove it
+			// to leave it in the permanentExpired state.
+			r, err := s.FindRoute(nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, false)
+			if err != nil {
+				t.Fatalf("FindRoute(%d, %s, %s, %d, false): %s", nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, err)
+			}
+			defer r.Release()
+			if err := s.RemoveAddress(nicID, addr2.Address); err != nil {
+				t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, addr2.Address, err)
+			}
+			// addr1 should be preferred again since addr2 is in the expired state.
+			expectPrimaryAddr(addr1)
+
+			// Receive a PI to auto-generate addr2 as valid and preferred.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+			expectAutoGenAddrEvent(addr2, newAddr)
+			// addr2 should be more preferred now that it is closer to the front of the
+			// primary list and not deprecated.
+			expectPrimaryAddr(addr2)
+
+			// Removing the address should result in an invalidation event immediately.
+			// It should still be in the permanentExpired state because r is still held.
+			//
+			// We remove addr2 here to make sure addr2 was marked as a SLAAC address
+			// (it was previously marked as a static address).
+			if err := s.RemoveAddress(1, addr2.Address); err != nil {
+				t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err)
+			}
+			expectAutoGenAddrEvent(addr2, invalidatedAddr)
+			// addr1 should be more preferred since addr2 is in the expired state.
+			expectPrimaryAddr(addr1)
+
+			// Receive a PI to auto-generate addr2 as valid and deprecated.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, 0))
+			expectAutoGenAddrEvent(addr2, newAddr)
+			// addr1 should still be more preferred since addr2 is deprecated, even though
+			// it is closer to the front of the primary list.
+			expectPrimaryAddr(addr1)
+
+			// Receive a PI to refresh addr2's preferred lifetime.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto gen addr event")
+			default:
+			}
+			// addr2 should be more preferred now that it is not deprecated.
+			expectPrimaryAddr(addr2)
 
-	if err := s.RemoveAddress(1, addr2.Address); err != nil {
-		t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err)
+			if err := s.RemoveAddress(1, addr2.Address); err != nil {
+				t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err)
+			}
+			expectAutoGenAddrEvent(addr2, invalidatedAddr)
+			expectPrimaryAddr(addr1)
+		})
 	}
-	expectAutoGenAddrEvent(addr2, invalidatedAddr)
-	expectPrimaryAddr(addr1)
 }
 
 // TestAutoGenAddrStaticConflict tests that if SLAAC generates an address that
@@ -3640,12 +3712,13 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			AutoGenGlobalAddresses: true,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs:              true,
+				AutoGenGlobalAddresses: true,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	})
 
 	if err := s.CreateNIC(1, e); err != nil {
@@ -3721,18 +3794,19 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			AutoGenGlobalAddresses: true,
-		},
-		NDPDisp: &ndpDisp,
-		OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
-			NICNameFromID: func(_ tcpip.NICID, nicName string) string {
-				return nicName
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs:              true,
+				AutoGenGlobalAddresses: true,
 			},
-			SecretKey: secretKey,
-		},
+			NDPDisp: &ndpDisp,
+			OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+					return nicName
+				},
+				SecretKey: secretKey,
+			},
+		})},
 	})
 	opts := stack.NICOptions{Name: nicName}
 	if err := s.CreateNICWithOptions(nicID, e, opts); err != nil {
@@ -3796,11 +3870,11 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) {
 	const lifetimeSeconds = 10
 
 	// Needed for the temporary address sub test.
-	savedMaxDesync := stack.MaxDesyncFactor
+	savedMaxDesync := ipv6.MaxDesyncFactor
 	defer func() {
-		stack.MaxDesyncFactor = savedMaxDesync
+		ipv6.MaxDesyncFactor = savedMaxDesync
 	}()
-	stack.MaxDesyncFactor = time.Nanosecond
+	ipv6.MaxDesyncFactor = time.Nanosecond
 
 	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes]byte
 	secretKey := secretKeyBuf[:]
@@ -3878,14 +3952,14 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) {
 
 	addrTypes := []struct {
 		name             string
-		ndpConfigs       stack.NDPConfigurations
+		ndpConfigs       ipv6.NDPConfigurations
 		autoGenLinkLocal bool
 		prepareFn        func(t *testing.T, ndpDisp *ndpDispatcher, e *channel.Endpoint, tempIIDHistory []byte) []tcpip.AddressWithPrefix
 		addrGenFn        func(dadCounter uint8, tempIIDHistory []byte) tcpip.AddressWithPrefix
 	}{
 		{
 			name: "Global address",
-			ndpConfigs: stack.NDPConfigurations{
+			ndpConfigs: ipv6.NDPConfigurations{
 				DupAddrDetectTransmits: dadTransmits,
 				RetransmitTimer:        retransmitTimer,
 				HandleRAs:              true,
@@ -3903,7 +3977,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) {
 		},
 		{
 			name: "LinkLocal address",
-			ndpConfigs: stack.NDPConfigurations{
+			ndpConfigs: ipv6.NDPConfigurations{
 				DupAddrDetectTransmits: dadTransmits,
 				RetransmitTimer:        retransmitTimer,
 			},
@@ -3917,7 +3991,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) {
 		},
 		{
 			name: "Temporary address",
-			ndpConfigs: stack.NDPConfigurations{
+			ndpConfigs: ipv6.NDPConfigurations{
 				DupAddrDetectTransmits:     dadTransmits,
 				RetransmitTimer:            retransmitTimer,
 				HandleRAs:                  true,
@@ -3969,16 +4043,17 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) {
 						ndpConfigs := addrType.ndpConfigs
 						ndpConfigs.AutoGenAddressConflictRetries = maxRetries
 						s := stack.New(stack.Options{
-							NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
-							AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal,
-							NDPConfigs:           ndpConfigs,
-							NDPDisp:              &ndpDisp,
-							OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
-								NICNameFromID: func(_ tcpip.NICID, nicName string) string {
-									return nicName
+							NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+								AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal,
+								NDPConfigs:           ndpConfigs,
+								NDPDisp:              &ndpDisp,
+								OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{
+									NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+										return nicName
+									},
+									SecretKey: secretKey,
 								},
-								SecretKey: secretKey,
-							},
+							})},
 						})
 						opts := stack.NICOptions{Name: nicName}
 						if err := s.CreateNICWithOptions(nicID, e, opts); err != nil {
@@ -3999,9 +4074,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) {
 							}
 
 							// Simulate a DAD conflict.
-							if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
-								t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
-							}
+							rxNDPSolicit(e, addr.Address)
 							expectAutoGenAddrEvent(t, &ndpDisp, addr, invalidatedAddr)
 							expectDADEvent(t, &ndpDisp, addr.Address, false)
 
@@ -4059,14 +4132,14 @@ func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) {
 
 	addrTypes := []struct {
 		name             string
-		ndpConfigs       stack.NDPConfigurations
+		ndpConfigs       ipv6.NDPConfigurations
 		autoGenLinkLocal bool
 		subnet           tcpip.Subnet
 		triggerSLAACFn   func(e *channel.Endpoint)
 	}{
 		{
 			name: "Global address",
-			ndpConfigs: stack.NDPConfigurations{
+			ndpConfigs: ipv6.NDPConfigurations{
 				DupAddrDetectTransmits:        dadTransmits,
 				RetransmitTimer:               retransmitTimer,
 				HandleRAs:                     true,
@@ -4082,7 +4155,7 @@ func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) {
 		},
 		{
 			name: "LinkLocal address",
-			ndpConfigs: stack.NDPConfigurations{
+			ndpConfigs: ipv6.NDPConfigurations{
 				DupAddrDetectTransmits:        dadTransmits,
 				RetransmitTimer:               retransmitTimer,
 				AutoGenAddressConflictRetries: maxRetries,
@@ -4105,10 +4178,11 @@ func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) {
 			}
 			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
-				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
-				AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal,
-				NDPConfigs:           addrType.ndpConfigs,
-				NDPDisp:              &ndpDisp,
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal,
+					NDPConfigs:           addrType.ndpConfigs,
+					NDPDisp:              &ndpDisp,
+				})},
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -4138,9 +4212,7 @@ func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) {
 			expectAutoGenAddrEvent(addr, newAddr)
 
 			// Simulate a DAD conflict.
-			if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
-				t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
-			}
+			rxNDPSolicit(e, addr.Address)
 			expectAutoGenAddrEvent(addr, invalidatedAddr)
 			select {
 			case e := <-ndpDisp.dadC:
@@ -4190,21 +4262,22 @@ func TestAutoGenAddrContinuesLifetimesAfterRetry(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			DupAddrDetectTransmits:        dadTransmits,
-			RetransmitTimer:               retransmitTimer,
-			HandleRAs:                     true,
-			AutoGenGlobalAddresses:        true,
-			AutoGenAddressConflictRetries: maxRetries,
-		},
-		NDPDisp: &ndpDisp,
-		OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
-			NICNameFromID: func(_ tcpip.NICID, nicName string) string {
-				return nicName
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				DupAddrDetectTransmits:        dadTransmits,
+				RetransmitTimer:               retransmitTimer,
+				HandleRAs:                     true,
+				AutoGenGlobalAddresses:        true,
+				AutoGenAddressConflictRetries: maxRetries,
 			},
-			SecretKey: secretKey,
-		},
+			NDPDisp: &ndpDisp,
+			OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+					return nicName
+				},
+				SecretKey: secretKey,
+			},
+		})},
 	})
 	opts := stack.NICOptions{Name: nicName}
 	if err := s.CreateNICWithOptions(nicID, e, opts); err != nil {
@@ -4236,9 +4309,7 @@ func TestAutoGenAddrContinuesLifetimesAfterRetry(t *testing.T) {
 
 	// Simulate a DAD conflict after some time has passed.
 	time.Sleep(failureTimer)
-	if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
-		t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
-	}
+	rxNDPSolicit(e, addr.Address)
 	expectAutoGenAddrEvent(addr, invalidatedAddr)
 	select {
 	case e := <-ndpDisp.dadC:
@@ -4399,11 +4470,12 @@ func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
 			}
 			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				NDPConfigs: stack.NDPConfigurations{
-					HandleRAs: true,
-				},
-				NDPDisp: &ndpDisp,
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					NDPConfigs: ipv6.NDPConfigurations{
+						HandleRAs: true,
+					},
+					NDPDisp: &ndpDisp,
+				})},
 			})
 			if err := s.CreateNIC(1, e); err != nil {
 				t.Fatalf("CreateNIC(1) = %s", err)
@@ -4449,11 +4521,12 @@ func TestNDPDNSSearchListDispatch(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs: true,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs: true,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -4580,7 +4653,7 @@ func TestCleanupNDPState(t *testing.T) {
 			name: "Enable forwarding",
 			cleanupFn: func(t *testing.T, s *stack.Stack) {
 				t.Helper()
-				s.SetForwarding(true)
+				s.SetForwarding(ipv6.ProtocolNumber, true)
 			},
 			keepAutoGenLinkLocal: true,
 			maxAutoGenAddrEvents: 4,
@@ -4634,15 +4707,16 @@ func TestCleanupNDPState(t *testing.T) {
 				autoGenAddrC:   make(chan ndpAutoGenAddrEvent, test.maxAutoGenAddrEvents),
 			}
 			s := stack.New(stack.Options{
-				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
-				AutoGenIPv6LinkLocal: true,
-				NDPConfigs: stack.NDPConfigurations{
-					HandleRAs:              true,
-					DiscoverDefaultRouters: true,
-					DiscoverOnLinkPrefixes: true,
-					AutoGenGlobalAddresses: true,
-				},
-				NDPDisp: &ndpDisp,
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					AutoGenIPv6LinkLocal: true,
+					NDPConfigs: ipv6.NDPConfigurations{
+						HandleRAs:              true,
+						DiscoverDefaultRouters: true,
+						DiscoverOnLinkPrefixes: true,
+						AutoGenGlobalAddresses: true,
+					},
+					NDPDisp: &ndpDisp,
+				})},
 			})
 
 			expectRouterEvent := func() (bool, ndpRouterEvent) {
@@ -4907,18 +4981,19 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
 	}
 	e := channel.New(0, 1280, linkAddr1)
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs: true,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				HandleRAs: true,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	})
 
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 	}
 
-	expectDHCPv6Event := func(configuration stack.DHCPv6ConfigurationFromNDPRA) {
+	expectDHCPv6Event := func(configuration ipv6.DHCPv6ConfigurationFromNDPRA) {
 		t.Helper()
 		select {
 		case e := <-ndpDisp.dhcpv6ConfigurationC:
@@ -4942,7 +5017,7 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
 	// Even if the first RA reports no DHCPv6 configurations are available, the
 	// dispatcher should get an event.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false))
-	expectDHCPv6Event(stack.DHCPv6NoConfiguration)
+	expectDHCPv6Event(ipv6.DHCPv6NoConfiguration)
 	// Receiving the same update again should not result in an event to the
 	// dispatcher.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false))
@@ -4951,19 +5026,19 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
 	// Receive an RA that updates the DHCPv6 configuration to Other
 	// Configurations.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
-	expectDHCPv6Event(stack.DHCPv6OtherConfigurations)
+	expectDHCPv6Event(ipv6.DHCPv6OtherConfigurations)
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
 	expectNoDHCPv6Event()
 
 	// Receive an RA that updates the DHCPv6 configuration to Managed Address.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, false))
-	expectDHCPv6Event(stack.DHCPv6ManagedAddress)
+	expectDHCPv6Event(ipv6.DHCPv6ManagedAddress)
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, false))
 	expectNoDHCPv6Event()
 
 	// Receive an RA that updates the DHCPv6 configuration to none.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false))
-	expectDHCPv6Event(stack.DHCPv6NoConfiguration)
+	expectDHCPv6Event(ipv6.DHCPv6NoConfiguration)
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false))
 	expectNoDHCPv6Event()
 
@@ -4971,7 +5046,7 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
 	//
 	// Note, when the M flag is set, the O flag is redundant.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, true))
-	expectDHCPv6Event(stack.DHCPv6ManagedAddress)
+	expectDHCPv6Event(ipv6.DHCPv6ManagedAddress)
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, true))
 	expectNoDHCPv6Event()
 	// Even though the DHCPv6 flags are different, the effective configuration is
@@ -4984,7 +5059,7 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
 	// Receive an RA that updates the DHCPv6 configuration to Other
 	// Configurations.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
-	expectDHCPv6Event(stack.DHCPv6OtherConfigurations)
+	expectDHCPv6Event(ipv6.DHCPv6OtherConfigurations)
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
 	expectNoDHCPv6Event()
 
@@ -4999,7 +5074,7 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
 	// Receive an RA that updates the DHCPv6 configuration to Other
 	// Configurations.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
-	expectDHCPv6Event(stack.DHCPv6OtherConfigurations)
+	expectDHCPv6Event(ipv6.DHCPv6OtherConfigurations)
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
 	expectNoDHCPv6Event()
 }
@@ -5157,12 +5232,13 @@ func TestRouterSolicitation(t *testing.T) {
 					}
 				}
 				s := stack.New(stack.Options{
-					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-					NDPConfigs: stack.NDPConfigurations{
-						MaxRtrSolicitations:     test.maxRtrSolicit,
-						RtrSolicitationInterval: test.rtrSolicitInt,
-						MaxRtrSolicitationDelay: test.maxRtrSolicitDelay,
-					},
+					NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+						NDPConfigs: ipv6.NDPConfigurations{
+							MaxRtrSolicitations:     test.maxRtrSolicit,
+							RtrSolicitationInterval: test.rtrSolicitInt,
+							MaxRtrSolicitationDelay: test.maxRtrSolicitDelay,
+						},
+					})},
 				})
 				if err := s.CreateNIC(nicID, &e); err != nil {
 					t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -5226,11 +5302,11 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 			name: "Enable and disable forwarding",
 			startFn: func(t *testing.T, s *stack.Stack) {
 				t.Helper()
-				s.SetForwarding(false)
+				s.SetForwarding(ipv6.ProtocolNumber, false)
 			},
 			stopFn: func(t *testing.T, s *stack.Stack, _ bool) {
 				t.Helper()
-				s.SetForwarding(true)
+				s.SetForwarding(ipv6.ProtocolNumber, true)
 			},
 		},
 
@@ -5297,12 +5373,13 @@ func TestStopStartSolicitingRouters(t *testing.T) {
 					checker.NDPRS())
 			}
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-				NDPConfigs: stack.NDPConfigurations{
-					MaxRtrSolicitations:     maxRtrSolicitations,
-					RtrSolicitationInterval: interval,
-					MaxRtrSolicitationDelay: delay,
-				},
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					NDPConfigs: ipv6.NDPConfigurations{
+						MaxRtrSolicitations:     maxRtrSolicitations,
+						RtrSolicitationInterval: interval,
+						MaxRtrSolicitationDelay: delay,
+					},
+				})},
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
diff --git a/pkg/tcpip/stack/neighbor_cache.go b/pkg/tcpip/stack/neighbor_cache.go
index 27e1feec0..eebf43a1f 100644
--- a/pkg/tcpip/stack/neighbor_cache.go
+++ b/pkg/tcpip/stack/neighbor_cache.go
@@ -68,7 +68,7 @@ var _ NUDHandler = (*neighborCache)(nil)
 // reset to state incomplete, and returned. If no matching entry exists and the
 // cache is not full, a new entry with state incomplete is allocated and
 // returned.
-func (n *neighborCache) getOrCreateEntry(remoteAddr, localAddr tcpip.Address, linkRes LinkAddressResolver) *neighborEntry {
+func (n *neighborCache) getOrCreateEntry(remoteAddr tcpip.Address, linkRes LinkAddressResolver) *neighborEntry {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
@@ -84,7 +84,7 @@ func (n *neighborCache) getOrCreateEntry(remoteAddr, localAddr tcpip.Address, li
 
 	// The entry that needs to be created must be dynamic since all static
 	// entries are directly added to the cache via addStaticEntry.
-	entry := newNeighborEntry(n.nic, remoteAddr, localAddr, n.state, linkRes)
+	entry := newNeighborEntry(n.nic, remoteAddr, n.state, linkRes)
 	if n.dynamic.count == neighborCacheSize {
 		e := n.dynamic.lru.Back()
 		e.mu.Lock()
@@ -111,6 +111,10 @@ func (n *neighborCache) getOrCreateEntry(remoteAddr, localAddr tcpip.Address, li
 // provided, it will be notified when address resolution is complete (success
 // or not).
 //
+// If specified, the local address must be an address local to the interface the
+// neighbor cache belongs to. The local address is the source address of a
+// packet prompting NUD/link address resolution.
+//
 // If address resolution is required, ErrNoLinkAddress and a notification
 // channel is returned for the top level caller to block. Channel is closed
 // once address resolution is complete (success or not).
@@ -118,7 +122,6 @@ func (n *neighborCache) entry(remoteAddr, localAddr tcpip.Address, linkRes LinkA
 	if linkAddr, ok := linkRes.ResolveStaticAddress(remoteAddr); ok {
 		e := NeighborEntry{
 			Addr:      remoteAddr,
-			LocalAddr: localAddr,
 			LinkAddr:  linkAddr,
 			State:     Static,
 			UpdatedAt: time.Now(),
@@ -126,15 +129,22 @@ func (n *neighborCache) entry(remoteAddr, localAddr tcpip.Address, linkRes LinkA
 		return e, nil, nil
 	}
 
-	entry := n.getOrCreateEntry(remoteAddr, localAddr, linkRes)
+	entry := n.getOrCreateEntry(remoteAddr, linkRes)
 	entry.mu.Lock()
 	defer entry.mu.Unlock()
 
 	switch s := entry.neigh.State; s {
-	case Reachable, Static:
+	case Stale:
+		entry.handlePacketQueuedLocked(localAddr)
+		fallthrough
+	case Reachable, Static, Delay, Probe:
+		// As per RFC 4861 section 7.3.3:
+		//  "Neighbor Unreachability Detection operates in parallel with the sending
+		//   of packets to a neighbor. While reasserting a neighbor's reachability,
+		//   a node continues sending packets to that neighbor using the cached
+		//   link-layer address."
 		return entry.neigh, nil, nil
-
-	case Unknown, Incomplete, Stale, Delay, Probe:
+	case Unknown, Incomplete:
 		entry.addWakerLocked(w)
 
 		if entry.done == nil {
@@ -145,12 +155,10 @@ func (n *neighborCache) entry(remoteAddr, localAddr tcpip.Address, linkRes LinkA
 			entry.done = make(chan struct{})
 		}
 
-		entry.handlePacketQueuedLocked()
+		entry.handlePacketQueuedLocked(localAddr)
 		return entry.neigh, entry.done, tcpip.ErrWouldBlock
-
 	case Failed:
 		return entry.neigh, nil, tcpip.ErrNoLinkAddress
-
 	default:
 		panic(fmt.Sprintf("Invalid cache entry state: %s", s))
 	}
@@ -202,7 +210,7 @@ func (n *neighborCache) addStaticEntry(addr tcpip.Address, linkAddr tcpip.LinkAd
 		} else {
 			// Static entry found with the same address but different link address.
 			entry.neigh.LinkAddr = linkAddr
-			entry.dispatchChangeEventLocked(entry.neigh.State)
+			entry.dispatchChangeEventLocked()
 			entry.mu.Unlock()
 			return
 		}
@@ -215,8 +223,7 @@ func (n *neighborCache) addStaticEntry(addr tcpip.Address, linkAddr tcpip.LinkAd
 		entry.mu.Unlock()
 	}
 
-	entry := newStaticNeighborEntry(n.nic, addr, linkAddr, n.state)
-	n.cache[addr] = entry
+	n.cache[addr] = newStaticNeighborEntry(n.nic, addr, linkAddr, n.state)
 }
 
 // removeEntryLocked removes the specified entry from the neighbor cache.
@@ -287,8 +294,8 @@ func (n *neighborCache) setConfig(config NUDConfigurations) {
 // HandleProbe implements NUDHandler.HandleProbe by following the logic defined
 // in RFC 4861 section 7.2.3. Validation of the probe is expected to be handled
 // by the caller.
-func (n *neighborCache) HandleProbe(remoteAddr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes LinkAddressResolver) {
-	entry := n.getOrCreateEntry(remoteAddr, localAddr, linkRes)
+func (n *neighborCache) HandleProbe(remoteAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes LinkAddressResolver) {
+	entry := n.getOrCreateEntry(remoteAddr, linkRes)
 	entry.mu.Lock()
 	entry.handleProbeLocked(remoteLinkAddr)
 	entry.mu.Unlock()
diff --git a/pkg/tcpip/stack/neighbor_cache_test.go b/pkg/tcpip/stack/neighbor_cache_test.go
index b4fa69e3e..d81f00848 100644
--- a/pkg/tcpip/stack/neighbor_cache_test.go
+++ b/pkg/tcpip/stack/neighbor_cache_test.go
@@ -30,6 +30,7 @@ import (
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/faketime"
 )
 
 const (
@@ -127,9 +128,8 @@ func newTestEntryStore() *testEntryStore {
 		linkAddr := toLinkAddress(i)
 
 		store.entriesMap[addr] = NeighborEntry{
-			Addr:      addr,
-			LocalAddr: testEntryLocalAddr,
-			LinkAddr:  linkAddr,
+			Addr:     addr,
+			LinkAddr: linkAddr,
 		}
 	}
 	return store
@@ -194,10 +194,10 @@ type testNeighborResolver struct {
 
 var _ LinkAddressResolver = (*testNeighborResolver)(nil)
 
-func (r *testNeighborResolver) LinkAddressRequest(addr, localAddr tcpip.Address, linkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error {
+func (r *testNeighborResolver) LinkAddressRequest(targetAddr, _ tcpip.Address, _ tcpip.LinkAddress, _ NetworkInterface) *tcpip.Error {
 	// Delay handling the request to emulate network latency.
 	r.clock.AfterFunc(r.delay, func() {
-		r.fakeRequest(addr)
+		r.fakeRequest(targetAddr)
 	})
 
 	// Execute post address resolution action, if available.
@@ -239,7 +239,7 @@ type entryEvent struct {
 func TestNeighborCacheGetConfig(t *testing.T) {
 	nudDisp := testNUDDispatcher{}
 	c := DefaultNUDConfigurations()
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(&nudDisp, c, clock)
 
 	if got, want := neigh.config(), c; got != want {
@@ -257,7 +257,7 @@ func TestNeighborCacheGetConfig(t *testing.T) {
 func TestNeighborCacheSetConfig(t *testing.T) {
 	nudDisp := testNUDDispatcher{}
 	c := DefaultNUDConfigurations()
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(&nudDisp, c, clock)
 
 	c.MinRandomFactor = 1
@@ -279,7 +279,7 @@ func TestNeighborCacheSetConfig(t *testing.T) {
 func TestNeighborCacheEntry(t *testing.T) {
 	c := DefaultNUDConfigurations()
 	nudDisp := testNUDDispatcher{}
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(&nudDisp, c, clock)
 	store := newTestEntryStore()
 	linkRes := &testNeighborResolver{
@@ -293,26 +293,29 @@ func TestNeighborCacheEntry(t *testing.T) {
 	if !ok {
 		t.Fatalf("store.entry(0) not found")
 	}
-	_, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil)
-	if err != tcpip.ErrWouldBlock {
-		t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+	if _, _, err := neigh.entry(entry.Addr, "", linkRes, nil); err != tcpip.ErrWouldBlock {
+		t.Errorf("got neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 	}
 
-	clock.advance(typicalLatency)
+	clock.Advance(typicalLatency)
 
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     1,
-			Addr:      entry.Addr,
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:  entry.Addr,
+				State: Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: entry.LinkAddr,
+				State:    Reachable,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -323,8 +326,8 @@ func TestNeighborCacheEntry(t *testing.T) {
 		t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
 	}
 
-	if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != nil {
-		t.Fatalf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err)
+	if _, _, err := neigh.entry(entry.Addr, "", linkRes, nil); err != nil {
+		t.Fatalf("unexpected error from neigh.entry(%s, '', _, nil): %s", entry.Addr, err)
 	}
 
 	// No more events should have been dispatched.
@@ -339,7 +342,7 @@ func TestNeighborCacheRemoveEntry(t *testing.T) {
 	config := DefaultNUDConfigurations()
 
 	nudDisp := testNUDDispatcher{}
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(&nudDisp, config, clock)
 	store := newTestEntryStore()
 	linkRes := &testNeighborResolver{
@@ -353,26 +356,30 @@ func TestNeighborCacheRemoveEntry(t *testing.T) {
 	if !ok {
 		t.Fatalf("store.entry(0) not found")
 	}
-	_, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil)
-	if err != tcpip.ErrWouldBlock {
-		t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+
+	if _, _, err := neigh.entry(entry.Addr, "", linkRes, nil); err != tcpip.ErrWouldBlock {
+		t.Errorf("got neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 	}
 
-	clock.advance(typicalLatency)
+	clock.Advance(typicalLatency)
 
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     1,
-			Addr:      entry.Addr,
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:  entry.Addr,
+				State: Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: entry.LinkAddr,
+				State:    Reachable,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -390,9 +397,11 @@ func TestNeighborCacheRemoveEntry(t *testing.T) {
 			{
 				EventType: entryTestRemoved,
 				NICID:     1,
-				Addr:      entry.Addr,
-				LinkAddr:  entry.LinkAddr,
-				State:     Reachable,
+				Entry: NeighborEntry{
+					Addr:     entry.Addr,
+					LinkAddr: entry.LinkAddr,
+					State:    Reachable,
+				},
 			},
 		}
 		nudDisp.mu.Lock()
@@ -403,13 +412,13 @@ func TestNeighborCacheRemoveEntry(t *testing.T) {
 		}
 	}
 
-	if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrWouldBlock {
-		t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+	if _, _, err := neigh.entry(entry.Addr, "", linkRes, nil); err != tcpip.ErrWouldBlock {
+		t.Errorf("got neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 	}
 }
 
 type testContext struct {
-	clock   *fakeClock
+	clock   *faketime.ManualClock
 	neigh   *neighborCache
 	store   *testEntryStore
 	linkRes *testNeighborResolver
@@ -418,7 +427,7 @@ type testContext struct {
 
 func newTestContext(c NUDConfigurations) testContext {
 	nudDisp := &testNUDDispatcher{}
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(nudDisp, c, clock)
 	store := newTestEntryStore()
 	linkRes := &testNeighborResolver{
@@ -451,10 +460,10 @@ func (c *testContext) overflowCache(opts overflowOptions) error {
 		if !ok {
 			return fmt.Errorf("c.store.entry(%d) not found", i)
 		}
-		if _, _, err := c.neigh.entry(entry.Addr, entry.LocalAddr, c.linkRes, nil); err != tcpip.ErrWouldBlock {
-			return fmt.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+		if _, _, err := c.neigh.entry(entry.Addr, "", c.linkRes, nil); err != tcpip.ErrWouldBlock {
+			return fmt.Errorf("got c.neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 		}
-		c.clock.advance(c.neigh.config().RetransmitTimer)
+		c.clock.Advance(c.neigh.config().RetransmitTimer)
 
 		var wantEvents []testEntryEventInfo
 
@@ -469,23 +478,29 @@ func (c *testContext) overflowCache(opts overflowOptions) error {
 			wantEvents = append(wantEvents, testEntryEventInfo{
 				EventType: entryTestRemoved,
 				NICID:     1,
-				Addr:      removedEntry.Addr,
-				LinkAddr:  removedEntry.LinkAddr,
-				State:     Reachable,
+				Entry: NeighborEntry{
+					Addr:     removedEntry.Addr,
+					LinkAddr: removedEntry.LinkAddr,
+					State:    Reachable,
+				},
 			})
 		}
 
 		wantEvents = append(wantEvents, testEntryEventInfo{
 			EventType: entryTestAdded,
 			NICID:     1,
-			Addr:      entry.Addr,
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:  entry.Addr,
+				State: Incomplete,
+			},
 		}, testEntryEventInfo{
 			EventType: entryTestChanged,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: entry.LinkAddr,
+				State:    Reachable,
+			},
 		})
 
 		c.nudDisp.mu.Lock()
@@ -507,10 +522,9 @@ func (c *testContext) overflowCache(opts overflowOptions) error {
 			return fmt.Errorf("c.store.entry(%d) not found", i)
 		}
 		wantEntry := NeighborEntry{
-			Addr:      entry.Addr,
-			LocalAddr: entry.LocalAddr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Reachable,
+			Addr:     entry.Addr,
+			LinkAddr: entry.LinkAddr,
+			State:    Reachable,
 		}
 		wantUnsortedEntries = append(wantUnsortedEntries, wantEntry)
 	}
@@ -563,24 +577,27 @@ func TestNeighborCacheRemoveEntryThenOverflow(t *testing.T) {
 	if !ok {
 		t.Fatalf("c.store.entry(0) not found")
 	}
-	_, _, err := c.neigh.entry(entry.Addr, entry.LocalAddr, c.linkRes, nil)
-	if err != tcpip.ErrWouldBlock {
-		t.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+	if _, _, err := c.neigh.entry(entry.Addr, "", c.linkRes, nil); err != tcpip.ErrWouldBlock {
+		t.Errorf("got c.neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 	}
-	c.clock.advance(c.neigh.config().RetransmitTimer)
+	c.clock.Advance(c.neigh.config().RetransmitTimer)
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     1,
-			Addr:      entry.Addr,
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:  entry.Addr,
+				State: Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: entry.LinkAddr,
+				State:    Reachable,
+			},
 		},
 	}
 	c.nudDisp.mu.Lock()
@@ -599,9 +616,11 @@ func TestNeighborCacheRemoveEntryThenOverflow(t *testing.T) {
 			{
 				EventType: entryTestRemoved,
 				NICID:     1,
-				Addr:      entry.Addr,
-				LinkAddr:  entry.LinkAddr,
-				State:     Reachable,
+				Entry: NeighborEntry{
+					Addr:     entry.Addr,
+					LinkAddr: entry.LinkAddr,
+					State:    Reachable,
+				},
 			},
 		}
 		c.nudDisp.mu.Lock()
@@ -639,9 +658,11 @@ func TestNeighborCacheDuplicateStaticEntryWithSameLinkAddress(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  staticLinkAddr,
-			State:     Static,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: staticLinkAddr,
+				State:    Static,
+			},
 		},
 	}
 	c.nudDisp.mu.Lock()
@@ -681,9 +702,11 @@ func TestNeighborCacheDuplicateStaticEntryWithDifferentLinkAddress(t *testing.T)
 		{
 			EventType: entryTestAdded,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  staticLinkAddr,
-			State:     Static,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: staticLinkAddr,
+				State:    Static,
+			},
 		},
 	}
 	c.nudDisp.mu.Lock()
@@ -702,9 +725,11 @@ func TestNeighborCacheDuplicateStaticEntryWithDifferentLinkAddress(t *testing.T)
 			{
 				EventType: entryTestChanged,
 				NICID:     1,
-				Addr:      entry.Addr,
-				LinkAddr:  staticLinkAddr,
-				State:     Static,
+				Entry: NeighborEntry{
+					Addr:     entry.Addr,
+					LinkAddr: staticLinkAddr,
+					State:    Static,
+				},
 			},
 		}
 		c.nudDisp.mu.Lock()
@@ -739,9 +764,11 @@ func TestNeighborCacheRemoveStaticEntryThenOverflow(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  staticLinkAddr,
-			State:     Static,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: staticLinkAddr,
+				State:    Static,
+			},
 		},
 	}
 	c.nudDisp.mu.Lock()
@@ -759,9 +786,11 @@ func TestNeighborCacheRemoveStaticEntryThenOverflow(t *testing.T) {
 			{
 				EventType: entryTestRemoved,
 				NICID:     1,
-				Addr:      entry.Addr,
-				LinkAddr:  staticLinkAddr,
-				State:     Static,
+				Entry: NeighborEntry{
+					Addr:     entry.Addr,
+					LinkAddr: staticLinkAddr,
+					State:    Static,
+				},
 			},
 		}
 		c.nudDisp.mu.Lock()
@@ -799,24 +828,27 @@ func TestNeighborCacheOverwriteWithStaticEntryThenOverflow(t *testing.T) {
 	if !ok {
 		t.Fatalf("c.store.entry(0) not found")
 	}
-	_, _, err := c.neigh.entry(entry.Addr, entry.LocalAddr, c.linkRes, nil)
-	if err != tcpip.ErrWouldBlock {
-		t.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+	if _, _, err := c.neigh.entry(entry.Addr, "", c.linkRes, nil); err != tcpip.ErrWouldBlock {
+		t.Errorf("got c.neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 	}
-	c.clock.advance(typicalLatency)
+	c.clock.Advance(typicalLatency)
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     1,
-			Addr:      entry.Addr,
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:  entry.Addr,
+				State: Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: entry.LinkAddr,
+				State:    Reachable,
+			},
 		},
 	}
 	c.nudDisp.mu.Lock()
@@ -835,16 +867,20 @@ func TestNeighborCacheOverwriteWithStaticEntryThenOverflow(t *testing.T) {
 			{
 				EventType: entryTestRemoved,
 				NICID:     1,
-				Addr:      entry.Addr,
-				LinkAddr:  entry.LinkAddr,
-				State:     Reachable,
+				Entry: NeighborEntry{
+					Addr:     entry.Addr,
+					LinkAddr: entry.LinkAddr,
+					State:    Reachable,
+				},
 			},
 			{
 				EventType: entryTestAdded,
 				NICID:     1,
-				Addr:      entry.Addr,
-				LinkAddr:  staticLinkAddr,
-				State:     Static,
+				Entry: NeighborEntry{
+					Addr:     entry.Addr,
+					LinkAddr: staticLinkAddr,
+					State:    Static,
+				},
 			},
 		}
 		c.nudDisp.mu.Lock()
@@ -860,10 +896,9 @@ func TestNeighborCacheOverwriteWithStaticEntryThenOverflow(t *testing.T) {
 		startAtEntryIndex: 1,
 		wantStaticEntries: []NeighborEntry{
 			{
-				Addr:      entry.Addr,
-				LocalAddr: "", // static entries don't need a local address
-				LinkAddr:  staticLinkAddr,
-				State:     Static,
+				Addr:     entry.Addr,
+				LinkAddr: staticLinkAddr,
+				State:    Static,
 			},
 		},
 	}
@@ -876,7 +911,7 @@ func TestNeighborCacheNotifiesWaker(t *testing.T) {
 	config := DefaultNUDConfigurations()
 
 	nudDisp := testNUDDispatcher{}
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(&nudDisp, config, clock)
 	store := newTestEntryStore()
 	linkRes := &testNeighborResolver{
@@ -895,14 +930,14 @@ func TestNeighborCacheNotifiesWaker(t *testing.T) {
 	if !ok {
 		t.Fatalf("store.entry(0) not found")
 	}
-	_, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, &w)
+	_, doneCh, err := neigh.entry(entry.Addr, "", linkRes, &w)
 	if err != tcpip.ErrWouldBlock {
-		t.Fatalf("got neigh.entry(%s, %s, _, _ = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+		t.Fatalf("got neigh.entry(%s, '', _, _ = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 	}
 	if doneCh == nil {
-		t.Fatalf("expected done channel from neigh.entry(%s, %s, _, _)", entry.Addr, entry.LocalAddr)
+		t.Fatalf("expected done channel from neigh.entry(%s, '', _, _)", entry.Addr)
 	}
-	clock.advance(typicalLatency)
+	clock.Advance(typicalLatency)
 
 	select {
 	case <-doneCh:
@@ -912,7 +947,7 @@ func TestNeighborCacheNotifiesWaker(t *testing.T) {
 
 	id, ok := s.Fetch(false /* block */)
 	if !ok {
-		t.Errorf("expected waker to be notified after neigh.entry(%s, %s, _, _)", entry.Addr, entry.LocalAddr)
+		t.Errorf("expected waker to be notified after neigh.entry(%s, '', _, _)", entry.Addr)
 	}
 	if id != wakerID {
 		t.Errorf("got s.Fetch(false) = %d, want = %d", id, wakerID)
@@ -922,15 +957,19 @@ func TestNeighborCacheNotifiesWaker(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     1,
-			Addr:      entry.Addr,
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:  entry.Addr,
+				State: Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: entry.LinkAddr,
+				State:    Reachable,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -944,7 +983,7 @@ func TestNeighborCacheRemoveWaker(t *testing.T) {
 	config := DefaultNUDConfigurations()
 
 	nudDisp := testNUDDispatcher{}
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(&nudDisp, config, clock)
 	store := newTestEntryStore()
 	linkRes := &testNeighborResolver{
@@ -963,18 +1002,18 @@ func TestNeighborCacheRemoveWaker(t *testing.T) {
 	if !ok {
 		t.Fatalf("store.entry(0) not found")
 	}
-	_, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, &w)
+	_, doneCh, err := neigh.entry(entry.Addr, "", linkRes, &w)
 	if err != tcpip.ErrWouldBlock {
-		t.Fatalf("got neigh.entry(%s, %s, _, _) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+		t.Fatalf("got neigh.entry(%s, '', _, _) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 	}
 	if doneCh == nil {
-		t.Fatalf("expected done channel from neigh.entry(%s, %s, _, _)", entry.Addr, entry.LocalAddr)
+		t.Fatalf("expected done channel from neigh.entry(%s, '', _, _)", entry.Addr)
 	}
 
 	// Remove the waker before the neighbor cache has the opportunity to send a
 	// notification.
 	neigh.removeWaker(entry.Addr, &w)
-	clock.advance(typicalLatency)
+	clock.Advance(typicalLatency)
 
 	select {
 	case <-doneCh:
@@ -990,15 +1029,19 @@ func TestNeighborCacheRemoveWaker(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     1,
-			Addr:      entry.Addr,
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:  entry.Addr,
+				State: Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: entry.LinkAddr,
+				State:    Reachable,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1027,10 +1070,9 @@ func TestNeighborCacheAddStaticEntryThenOverflow(t *testing.T) {
 		t.Errorf("unexpected error from c.neigh.entry(%s, \"\", _, nil): %s", entry.Addr, err)
 	}
 	want := NeighborEntry{
-		Addr:      entry.Addr,
-		LocalAddr: "", // static entries don't need a local address
-		LinkAddr:  entry.LinkAddr,
-		State:     Static,
+		Addr:     entry.Addr,
+		LinkAddr: entry.LinkAddr,
+		State:    Static,
 	}
 	if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" {
 		t.Errorf("c.neigh.entry(%s, \"\", _, nil) mismatch (-got, +want):\n%s", entry.Addr, diff)
@@ -1040,9 +1082,11 @@ func TestNeighborCacheAddStaticEntryThenOverflow(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Static,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: entry.LinkAddr,
+				State:    Static,
+			},
 		},
 	}
 	c.nudDisp.mu.Lock()
@@ -1057,10 +1101,9 @@ func TestNeighborCacheAddStaticEntryThenOverflow(t *testing.T) {
 		startAtEntryIndex: 1,
 		wantStaticEntries: []NeighborEntry{
 			{
-				Addr:      entry.Addr,
-				LocalAddr: "", // static entries don't need a local address
-				LinkAddr:  entry.LinkAddr,
-				State:     Static,
+				Addr:     entry.Addr,
+				LinkAddr: entry.LinkAddr,
+				State:    Static,
 			},
 		},
 	}
@@ -1073,7 +1116,7 @@ func TestNeighborCacheClear(t *testing.T) {
 	config := DefaultNUDConfigurations()
 
 	nudDisp := testNUDDispatcher{}
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(&nudDisp, config, clock)
 	store := newTestEntryStore()
 	linkRes := &testNeighborResolver{
@@ -1088,25 +1131,28 @@ func TestNeighborCacheClear(t *testing.T) {
 	if !ok {
 		t.Fatalf("store.entry(0) not found")
 	}
-	_, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil)
-	if err != tcpip.ErrWouldBlock {
-		t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+	if _, _, err := neigh.entry(entry.Addr, "", linkRes, nil); err != tcpip.ErrWouldBlock {
+		t.Errorf("got neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 	}
-	clock.advance(typicalLatency)
+	clock.Advance(typicalLatency)
 
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     1,
-			Addr:      entry.Addr,
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:  entry.Addr,
+				State: Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: entry.LinkAddr,
+				State:    Reachable,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1125,9 +1171,11 @@ func TestNeighborCacheClear(t *testing.T) {
 			{
 				EventType: entryTestAdded,
 				NICID:     1,
-				Addr:      entryTestAddr1,
-				LinkAddr:  entryTestLinkAddr1,
-				State:     Static,
+				Entry: NeighborEntry{
+					Addr:     entryTestAddr1,
+					LinkAddr: entryTestLinkAddr1,
+					State:    Static,
+				},
 			},
 		}
 		nudDisp.mu.Lock()
@@ -1148,16 +1196,20 @@ func TestNeighborCacheClear(t *testing.T) {
 		{
 			EventType: entryTestRemoved,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: entry.LinkAddr,
+				State:    Reachable,
+			},
 		},
 		{
 			EventType: entryTestRemoved,
 			NICID:     1,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Static,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Static,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1184,24 +1236,27 @@ func TestNeighborCacheClearThenOverflow(t *testing.T) {
 	if !ok {
 		t.Fatalf("c.store.entry(0) not found")
 	}
-	_, _, err := c.neigh.entry(entry.Addr, entry.LocalAddr, c.linkRes, nil)
-	if err != tcpip.ErrWouldBlock {
-		t.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+	if _, _, err := c.neigh.entry(entry.Addr, "", c.linkRes, nil); err != tcpip.ErrWouldBlock {
+		t.Errorf("got c.neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 	}
-	c.clock.advance(typicalLatency)
+	c.clock.Advance(typicalLatency)
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     1,
-			Addr:      entry.Addr,
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:  entry.Addr,
+				State: Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     1,
-			Addr:      entry.Addr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entry.Addr,
+				LinkAddr: entry.LinkAddr,
+				State:    Reachable,
+			},
 		},
 	}
 	c.nudDisp.mu.Lock()
@@ -1219,9 +1274,11 @@ func TestNeighborCacheClearThenOverflow(t *testing.T) {
 			{
 				EventType: entryTestRemoved,
 				NICID:     1,
-				Addr:      entry.Addr,
-				LinkAddr:  entry.LinkAddr,
-				State:     Reachable,
+				Entry: NeighborEntry{
+					Addr:     entry.Addr,
+					LinkAddr: entry.LinkAddr,
+					State:    Reachable,
+				},
 			},
 		}
 		c.nudDisp.mu.Lock()
@@ -1249,7 +1306,7 @@ func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) {
 	config.MaxRandomFactor = 1
 
 	nudDisp := testNUDDispatcher{}
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(&nudDisp, config, clock)
 	store := newTestEntryStore()
 	linkRes := &testNeighborResolver{
@@ -1273,29 +1330,33 @@ func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) {
 		if !ok {
 			t.Fatalf("store.entry(%d) not found", i)
 		}
-		_, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil)
+		_, doneCh, err := neigh.entry(entry.Addr, "", linkRes, nil)
 		if err != tcpip.ErrWouldBlock {
-			t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+			t.Errorf("got neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 		}
-		clock.advance(typicalLatency)
+		clock.Advance(typicalLatency)
 		select {
 		case <-doneCh:
 		default:
-			t.Fatalf("expected notification from done channel returned by neigh.entry(%s, %s, _, nil)", entry.Addr, entry.LocalAddr)
+			t.Fatalf("expected notification from done channel returned by neigh.entry(%s, '', _, nil)", entry.Addr)
 		}
 		wantEvents := []testEntryEventInfo{
 			{
 				EventType: entryTestAdded,
 				NICID:     1,
-				Addr:      entry.Addr,
-				State:     Incomplete,
+				Entry: NeighborEntry{
+					Addr:  entry.Addr,
+					State: Incomplete,
+				},
 			},
 			{
 				EventType: entryTestChanged,
 				NICID:     1,
-				Addr:      entry.Addr,
-				LinkAddr:  entry.LinkAddr,
-				State:     Reachable,
+				Entry: NeighborEntry{
+					Addr:     entry.Addr,
+					LinkAddr: entry.LinkAddr,
+					State:    Reachable,
+				},
 			},
 		}
 		nudDisp.mu.Lock()
@@ -1311,9 +1372,8 @@ func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) {
 	for i := neighborCacheSize; i < store.size(); i++ {
 		// Periodically refresh the frequently used entry
 		if i%(neighborCacheSize/2) == 0 {
-			_, _, err := neigh.entry(frequentlyUsedEntry.Addr, frequentlyUsedEntry.LocalAddr, linkRes, nil)
-			if err != nil {
-				t.Errorf("unexpected error from neigh.entry(%s, %s, _, nil): %s", frequentlyUsedEntry.Addr, frequentlyUsedEntry.LocalAddr, err)
+			if _, _, err := neigh.entry(frequentlyUsedEntry.Addr, "", linkRes, nil); err != nil {
+				t.Errorf("unexpected error from neigh.entry(%s, '', _, nil): %s", frequentlyUsedEntry.Addr, err)
 			}
 		}
 
@@ -1321,15 +1381,15 @@ func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) {
 		if !ok {
 			t.Fatalf("store.entry(%d) not found", i)
 		}
-		_, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil)
+		_, doneCh, err := neigh.entry(entry.Addr, "", linkRes, nil)
 		if err != tcpip.ErrWouldBlock {
-			t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+			t.Errorf("got neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 		}
-		clock.advance(typicalLatency)
+		clock.Advance(typicalLatency)
 		select {
 		case <-doneCh:
 		default:
-			t.Fatalf("expected notification from done channel returned by neigh.entry(%s, %s, _, nil)", entry.Addr, entry.LocalAddr)
+			t.Fatalf("expected notification from done channel returned by neigh.entry(%s, '', _, nil)", entry.Addr)
 		}
 
 		// An entry should have been removed, as per the LRU eviction strategy
@@ -1341,22 +1401,28 @@ func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) {
 			{
 				EventType: entryTestRemoved,
 				NICID:     1,
-				Addr:      removedEntry.Addr,
-				LinkAddr:  removedEntry.LinkAddr,
-				State:     Reachable,
+				Entry: NeighborEntry{
+					Addr:     removedEntry.Addr,
+					LinkAddr: removedEntry.LinkAddr,
+					State:    Reachable,
+				},
 			},
 			{
 				EventType: entryTestAdded,
 				NICID:     1,
-				Addr:      entry.Addr,
-				State:     Incomplete,
+				Entry: NeighborEntry{
+					Addr:  entry.Addr,
+					State: Incomplete,
+				},
 			},
 			{
 				EventType: entryTestChanged,
 				NICID:     1,
-				Addr:      entry.Addr,
-				LinkAddr:  entry.LinkAddr,
-				State:     Reachable,
+				Entry: NeighborEntry{
+					Addr:     entry.Addr,
+					LinkAddr: entry.LinkAddr,
+					State:    Reachable,
+				},
 			},
 		}
 		nudDisp.mu.Lock()
@@ -1373,10 +1439,9 @@ func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) {
 	// have to be sorted before comparison.
 	wantUnsortedEntries := []NeighborEntry{
 		{
-			Addr:      frequentlyUsedEntry.Addr,
-			LocalAddr: frequentlyUsedEntry.LocalAddr,
-			LinkAddr:  frequentlyUsedEntry.LinkAddr,
-			State:     Reachable,
+			Addr:     frequentlyUsedEntry.Addr,
+			LinkAddr: frequentlyUsedEntry.LinkAddr,
+			State:    Reachable,
 		},
 	}
 
@@ -1386,10 +1451,9 @@ func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) {
 			t.Fatalf("store.entry(%d) not found", i)
 		}
 		wantEntry := NeighborEntry{
-			Addr:      entry.Addr,
-			LocalAddr: entry.LocalAddr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Reachable,
+			Addr:     entry.Addr,
+			LinkAddr: entry.LinkAddr,
+			State:    Reachable,
 		}
 		wantUnsortedEntries = append(wantUnsortedEntries, wantEntry)
 	}
@@ -1412,7 +1476,7 @@ func TestNeighborCacheConcurrent(t *testing.T) {
 	config := DefaultNUDConfigurations()
 
 	nudDisp := testNUDDispatcher{}
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(&nudDisp, config, clock)
 	store := newTestEntryStore()
 	linkRes := &testNeighborResolver{
@@ -1429,9 +1493,8 @@ func TestNeighborCacheConcurrent(t *testing.T) {
 			wg.Add(1)
 			go func(entry NeighborEntry) {
 				defer wg.Done()
-				e, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil)
-				if err != nil && err != tcpip.ErrWouldBlock {
-					t.Errorf("got neigh.entry(%s, %s, _, nil) = (%+v, _, %s), want (_, _, nil) or (_, _, %s)", entry.Addr, entry.LocalAddr, e, err, tcpip.ErrWouldBlock)
+				if e, _, err := neigh.entry(entry.Addr, "", linkRes, nil); err != nil && err != tcpip.ErrWouldBlock {
+					t.Errorf("got neigh.entry(%s, '', _, nil) = (%+v, _, %s), want (_, _, nil) or (_, _, %s)", entry.Addr, e, err, tcpip.ErrWouldBlock)
 				}
 			}(entry)
 		}
@@ -1440,7 +1503,7 @@ func TestNeighborCacheConcurrent(t *testing.T) {
 		wg.Wait()
 
 		// Process all the requests for a single entry concurrently
-		clock.advance(typicalLatency)
+		clock.Advance(typicalLatency)
 	}
 
 	// All goroutines add in the same order and add more values than can fit in
@@ -1455,10 +1518,9 @@ func TestNeighborCacheConcurrent(t *testing.T) {
 			t.Errorf("store.entry(%d) not found", i)
 		}
 		wantEntry := NeighborEntry{
-			Addr:      entry.Addr,
-			LocalAddr: entry.LocalAddr,
-			LinkAddr:  entry.LinkAddr,
-			State:     Reachable,
+			Addr:     entry.Addr,
+			LinkAddr: entry.LinkAddr,
+			State:    Reachable,
 		}
 		wantUnsortedEntries = append(wantUnsortedEntries, wantEntry)
 	}
@@ -1472,7 +1534,7 @@ func TestNeighborCacheReplace(t *testing.T) {
 	config := DefaultNUDConfigurations()
 
 	nudDisp := testNUDDispatcher{}
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(&nudDisp, config, clock)
 	store := newTestEntryStore()
 	linkRes := &testNeighborResolver{
@@ -1487,36 +1549,37 @@ func TestNeighborCacheReplace(t *testing.T) {
 	if !ok {
 		t.Fatalf("store.entry(0) not found")
 	}
-	_, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil)
+	_, doneCh, err := neigh.entry(entry.Addr, "", linkRes, nil)
 	if err != tcpip.ErrWouldBlock {
-		t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+		t.Fatalf("got neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 	}
-	clock.advance(typicalLatency)
+	clock.Advance(typicalLatency)
 	select {
 	case <-doneCh:
 	default:
-		t.Fatalf("expected notification from done channel returned by neigh.entry(%s, %s, _, nil)", entry.Addr, entry.LocalAddr)
+		t.Fatalf("expected notification from done channel returned by neigh.entry(%s, '', _, nil)", entry.Addr)
 	}
 
 	// Verify the entry exists
-	e, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil)
-	if err != nil {
-		t.Errorf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err)
-	}
-	if doneCh != nil {
-		t.Errorf("unexpected done channel from neigh.entry(%s, %s, _, nil): %v", entry.Addr, entry.LocalAddr, doneCh)
-	}
-	if t.Failed() {
-		t.FailNow()
-	}
-	want := NeighborEntry{
-		Addr:      entry.Addr,
-		LocalAddr: entry.LocalAddr,
-		LinkAddr:  entry.LinkAddr,
-		State:     Reachable,
-	}
-	if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" {
-		t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", entry.Addr, entry.LinkAddr, diff)
+	{
+		e, doneCh, err := neigh.entry(entry.Addr, "", linkRes, nil)
+		if err != nil {
+			t.Errorf("unexpected error from neigh.entry(%s, '', _, nil): %s", entry.Addr, err)
+		}
+		if doneCh != nil {
+			t.Errorf("unexpected done channel from neigh.entry(%s, '', _, nil): %v", entry.Addr, doneCh)
+		}
+		if t.Failed() {
+			t.FailNow()
+		}
+		want := NeighborEntry{
+			Addr:     entry.Addr,
+			LinkAddr: entry.LinkAddr,
+			State:    Reachable,
+		}
+		if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" {
+			t.Errorf("neigh.entry(%s, '', _, nil) mismatch (-got, +want):\n%s", entry.Addr, diff)
+		}
 	}
 
 	// Notify of a link address change
@@ -1535,35 +1598,39 @@ func TestNeighborCacheReplace(t *testing.T) {
 		IsRouter:  false,
 	})
 
-	// Requesting the entry again should start address resolution
+	// Requesting the entry again should start neighbor reachability confirmation.
+	//
+	// Verify the entry's new link address and the new state.
 	{
-		_, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil)
-		if err != tcpip.ErrWouldBlock {
-			t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+		e, _, err := neigh.entry(entry.Addr, "", linkRes, nil)
+		if err != nil {
+			t.Fatalf("neigh.entry(%s, '', _, nil): %s", entry.Addr, err)
 		}
-		clock.advance(config.DelayFirstProbeTime + typicalLatency)
-		select {
-		case <-doneCh:
-		default:
-			t.Fatalf("expected notification from done channel returned by neigh.entry(%s, %s, _, nil)", entry.Addr, entry.LocalAddr)
+		want := NeighborEntry{
+			Addr:     entry.Addr,
+			LinkAddr: updatedLinkAddr,
+			State:    Delay,
+		}
+		if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" {
+			t.Errorf("neigh.entry(%s, '', _, nil) mismatch (-got, +want):\n%s", entry.Addr, diff)
 		}
+		clock.Advance(config.DelayFirstProbeTime + typicalLatency)
 	}
 
-	// Verify the entry's new link address
+	// Verify that the neighbor is now reachable.
 	{
-		e, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil)
-		clock.advance(typicalLatency)
+		e, _, err := neigh.entry(entry.Addr, "", linkRes, nil)
+		clock.Advance(typicalLatency)
 		if err != nil {
-			t.Errorf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err)
+			t.Errorf("unexpected error from neigh.entry(%s, '', _, nil): %s", entry.Addr, err)
 		}
-		want = NeighborEntry{
-			Addr:      entry.Addr,
-			LocalAddr: entry.LocalAddr,
-			LinkAddr:  updatedLinkAddr,
-			State:     Reachable,
+		want := NeighborEntry{
+			Addr:     entry.Addr,
+			LinkAddr: updatedLinkAddr,
+			State:    Reachable,
 		}
 		if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" {
-			t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", entry.Addr, entry.LocalAddr, diff)
+			t.Errorf("neigh.entry(%s, '', _, nil) mismatch (-got, +want):\n%s", entry.Addr, diff)
 		}
 	}
 }
@@ -1572,7 +1639,7 @@ func TestNeighborCacheResolutionFailed(t *testing.T) {
 	config := DefaultNUDConfigurations()
 
 	nudDisp := testNUDDispatcher{}
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(&nudDisp, config, clock)
 	store := newTestEntryStore()
 
@@ -1592,35 +1659,34 @@ func TestNeighborCacheResolutionFailed(t *testing.T) {
 	if !ok {
 		t.Fatalf("store.entry(0) not found")
 	}
-	if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrWouldBlock {
-		t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+	if _, _, err := neigh.entry(entry.Addr, "", linkRes, nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 	}
-	clock.advance(typicalLatency)
-	got, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil)
+	clock.Advance(typicalLatency)
+	got, _, err := neigh.entry(entry.Addr, "", linkRes, nil)
 	if err != nil {
-		t.Fatalf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err)
+		t.Fatalf("unexpected error from neigh.entry(%s, '', _, nil): %s", entry.Addr, err)
 	}
 	want := NeighborEntry{
-		Addr:      entry.Addr,
-		LocalAddr: entry.LocalAddr,
-		LinkAddr:  entry.LinkAddr,
-		State:     Reachable,
+		Addr:     entry.Addr,
+		LinkAddr: entry.LinkAddr,
+		State:    Reachable,
 	}
 	if diff := cmp.Diff(got, want, entryDiffOpts()...); diff != "" {
-		t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", entry.Addr, entry.LocalAddr, diff)
+		t.Errorf("neigh.entry(%s, '', _, nil) mismatch (-got, +want):\n%s", entry.Addr, diff)
 	}
 
 	// Verify that address resolution for an unknown address returns ErrNoLinkAddress
 	before := atomic.LoadUint32(&requestCount)
 
 	entry.Addr += "2"
-	if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrWouldBlock {
-		t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+	if _, _, err := neigh.entry(entry.Addr, "", linkRes, nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 	}
 	waitFor := config.DelayFirstProbeTime + typicalLatency*time.Duration(config.MaxMulticastProbes)
-	clock.advance(waitFor)
-	if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrNoLinkAddress {
-		t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrNoLinkAddress)
+	clock.Advance(waitFor)
+	if _, _, err := neigh.entry(entry.Addr, "", linkRes, nil); err != tcpip.ErrNoLinkAddress {
+		t.Fatalf("got neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrNoLinkAddress)
 	}
 
 	maxAttempts := neigh.config().MaxUnicastProbes
@@ -1636,7 +1702,7 @@ func TestNeighborCacheResolutionTimeout(t *testing.T) {
 	config := DefaultNUDConfigurations()
 	config.RetransmitTimer = time.Millisecond // small enough to cause timeout
 
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(nil, config, clock)
 	store := newTestEntryStore()
 	linkRes := &testNeighborResolver{
@@ -1650,13 +1716,13 @@ func TestNeighborCacheResolutionTimeout(t *testing.T) {
 	if !ok {
 		t.Fatalf("store.entry(0) not found")
 	}
-	if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrWouldBlock {
-		t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+	if _, _, err := neigh.entry(entry.Addr, "", linkRes, nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 	}
 	waitFor := config.RetransmitTimer * time.Duration(config.MaxMulticastProbes)
-	clock.advance(waitFor)
-	if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrNoLinkAddress {
-		t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrNoLinkAddress)
+	clock.Advance(waitFor)
+	if _, _, err := neigh.entry(entry.Addr, "", linkRes, nil); err != tcpip.ErrNoLinkAddress {
+		t.Fatalf("got neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrNoLinkAddress)
 	}
 }
 
@@ -1664,7 +1730,7 @@ func TestNeighborCacheResolutionTimeout(t *testing.T) {
 // resolved immediately and don't send resolution requests.
 func TestNeighborCacheStaticResolution(t *testing.T) {
 	config := DefaultNUDConfigurations()
-	clock := newFakeClock()
+	clock := faketime.NewManualClock()
 	neigh := newTestNeighborCache(nil, config, clock)
 	store := newTestEntryStore()
 	linkRes := &testNeighborResolver{
@@ -1674,18 +1740,17 @@ func TestNeighborCacheStaticResolution(t *testing.T) {
 		delay:   typicalLatency,
 	}
 
-	got, _, err := neigh.entry(testEntryBroadcastAddr, testEntryLocalAddr, linkRes, nil)
+	got, _, err := neigh.entry(testEntryBroadcastAddr, "", linkRes, nil)
 	if err != nil {
-		t.Fatalf("unexpected error from neigh.entry(%s, %s, _, nil): %s", testEntryBroadcastAddr, testEntryLocalAddr, err)
+		t.Fatalf("unexpected error from neigh.entry(%s, '', _, nil): %s", testEntryBroadcastAddr, err)
 	}
 	want := NeighborEntry{
-		Addr:      testEntryBroadcastAddr,
-		LocalAddr: testEntryLocalAddr,
-		LinkAddr:  testEntryBroadcastLinkAddr,
-		State:     Static,
+		Addr:     testEntryBroadcastAddr,
+		LinkAddr: testEntryBroadcastLinkAddr,
+		State:    Static,
 	}
 	if diff := cmp.Diff(got, want, entryDiffOpts()...); diff != "" {
-		t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", testEntryBroadcastAddr, testEntryLocalAddr, diff)
+		t.Errorf("neigh.entry(%s, '', _, nil) mismatch (-got, +want):\n%s", testEntryBroadcastAddr, diff)
 	}
 }
 
@@ -1710,9 +1775,9 @@ func BenchmarkCacheClear(b *testing.B) {
 			if !ok {
 				b.Fatalf("store.entry(%d) not found", i)
 			}
-			_, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil)
+			_, doneCh, err := neigh.entry(entry.Addr, "", linkRes, nil)
 			if err != tcpip.ErrWouldBlock {
-				b.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock)
+				b.Fatalf("got neigh.entry(%s, '', _, nil) = %v, want = %s", entry.Addr, err, tcpip.ErrWouldBlock)
 			}
 			if doneCh != nil {
 				<-doneCh
diff --git a/pkg/tcpip/stack/neighbor_entry.go b/pkg/tcpip/stack/neighbor_entry.go
index 0068cacb8..bd80f95bd 100644
--- a/pkg/tcpip/stack/neighbor_entry.go
+++ b/pkg/tcpip/stack/neighbor_entry.go
@@ -21,12 +21,12 @@ import (
 
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
 // NeighborEntry describes a neighboring device in the local network.
 type NeighborEntry struct {
 	Addr      tcpip.Address
-	LocalAddr tcpip.Address
 	LinkAddr  tcpip.LinkAddress
 	State     NeighborState
 	UpdatedAt time.Time
@@ -73,8 +73,7 @@ const (
 type neighborEntry struct {
 	neighborEntryEntry
 
-	nic      *NIC
-	protocol tcpip.NetworkProtocolNumber
+	nic *NIC
 
 	// linkRes provides the functionality to send reachability probes, used in
 	// Neighbor Unreachability Detection.
@@ -106,35 +105,35 @@ type neighborEntry struct {
 // state, Unknown. Transition out of Unknown by calling either
 // `handlePacketQueuedLocked` or `handleProbeLocked` on the newly created
 // neighborEntry.
-func newNeighborEntry(nic *NIC, remoteAddr tcpip.Address, localAddr tcpip.Address, nudState *NUDState, linkRes LinkAddressResolver) *neighborEntry {
+func newNeighborEntry(nic *NIC, remoteAddr tcpip.Address, nudState *NUDState, linkRes LinkAddressResolver) *neighborEntry {
 	return &neighborEntry{
 		nic:      nic,
 		linkRes:  linkRes,
 		nudState: nudState,
 		neigh: NeighborEntry{
-			Addr:      remoteAddr,
-			LocalAddr: localAddr,
-			State:     Unknown,
+			Addr:  remoteAddr,
+			State: Unknown,
 		},
 	}
 }
 
-// newStaticNeighborEntry creates a neighbor cache entry starting at the Static
-// state. The entry can only transition out of Static by directly calling
-// `setStateLocked`.
+// newStaticNeighborEntry creates a neighbor cache entry starting at the
+// Static state. The entry can only transition out of Static by directly
+// calling `setStateLocked`.
 func newStaticNeighborEntry(nic *NIC, addr tcpip.Address, linkAddr tcpip.LinkAddress, state *NUDState) *neighborEntry {
+	entry := NeighborEntry{
+		Addr:      addr,
+		LinkAddr:  linkAddr,
+		State:     Static,
+		UpdatedAt: time.Now(),
+	}
 	if nic.stack.nudDisp != nil {
-		nic.stack.nudDisp.OnNeighborAdded(nic.id, addr, linkAddr, Static, time.Now())
+		nic.stack.nudDisp.OnNeighborAdded(nic.id, entry)
 	}
 	return &neighborEntry{
 		nic:      nic,
 		nudState: state,
-		neigh: NeighborEntry{
-			Addr:      addr,
-			LinkAddr:  linkAddr,
-			State:     Static,
-			UpdatedAt: time.Now(),
-		},
+		neigh:    entry,
 	}
 }
 
@@ -165,17 +164,17 @@ func (e *neighborEntry) notifyWakersLocked() {
 
 // dispatchAddEventLocked signals to stack's NUD Dispatcher that the entry has
 // been added.
-func (e *neighborEntry) dispatchAddEventLocked(nextState NeighborState) {
+func (e *neighborEntry) dispatchAddEventLocked() {
 	if nudDisp := e.nic.stack.nudDisp; nudDisp != nil {
-		nudDisp.OnNeighborAdded(e.nic.id, e.neigh.Addr, e.neigh.LinkAddr, nextState, time.Now())
+		nudDisp.OnNeighborAdded(e.nic.id, e.neigh)
 	}
 }
 
 // dispatchChangeEventLocked signals to stack's NUD Dispatcher that the entry
 // has changed state or link-layer address.
-func (e *neighborEntry) dispatchChangeEventLocked(nextState NeighborState) {
+func (e *neighborEntry) dispatchChangeEventLocked() {
 	if nudDisp := e.nic.stack.nudDisp; nudDisp != nil {
-		nudDisp.OnNeighborChanged(e.nic.id, e.neigh.Addr, e.neigh.LinkAddr, nextState, time.Now())
+		nudDisp.OnNeighborChanged(e.nic.id, e.neigh)
 	}
 }
 
@@ -183,7 +182,7 @@ func (e *neighborEntry) dispatchChangeEventLocked(nextState NeighborState) {
 // has been removed.
 func (e *neighborEntry) dispatchRemoveEventLocked() {
 	if nudDisp := e.nic.stack.nudDisp; nudDisp != nil {
-		nudDisp.OnNeighborRemoved(e.nic.id, e.neigh.Addr, e.neigh.LinkAddr, e.neigh.State, time.Now())
+		nudDisp.OnNeighborRemoved(e.nic.id, e.neigh)
 	}
 }
 
@@ -206,63 +205,19 @@ func (e *neighborEntry) setStateLocked(next NeighborState) {
 
 	switch next {
 	case Incomplete:
-		var retryCounter uint32
-		var sendMulticastProbe func()
-
-		sendMulticastProbe = func() {
-			if retryCounter == config.MaxMulticastProbes {
-				// "If no Neighbor Advertisement is received after
-				// MAX_MULTICAST_SOLICIT solicitations, address resolution has failed.
-				// The sender MUST return ICMP destination unreachable indications with
-				// code 3 (Address Unreachable) for each packet queued awaiting address
-				// resolution." - RFC 4861 section 7.2.2
-				//
-				// There is no need to send an ICMP destination unreachable indication
-				// since the failure to resolve the address is expected to only occur
-				// on this node. Thus, redirecting traffic is currently not supported.
-				//
-				// "If the error occurs on a node other than the node originating the
-				// packet, an ICMP error message is generated. If the error occurs on
-				// the originating node, an implementation is not required to actually
-				// create and send an ICMP error packet to the source, as long as the
-				// upper-layer sender is notified through an appropriate mechanism
-				// (e.g. return value from a procedure call). Note, however, that an
-				// implementation may find it convenient in some cases to return errors
-				// to the sender by taking the offending packet, generating an ICMP
-				// error message, and then delivering it (locally) through the generic
-				// error-handling routines.' - RFC 4861 section 2.1
-				e.dispatchRemoveEventLocked()
-				e.setStateLocked(Failed)
-				return
-			}
-
-			if err := e.linkRes.LinkAddressRequest(e.neigh.Addr, e.neigh.LocalAddr, "", e.nic.linkEP); err != nil {
-				// There is no need to log the error here; the NUD implementation may
-				// assume a working link. A valid link should be the responsibility of
-				// the NIC/stack.LinkEndpoint.
-				e.dispatchRemoveEventLocked()
-				e.setStateLocked(Failed)
-				return
-			}
-
-			retryCounter++
-			e.job = e.nic.stack.newJob(&e.mu, sendMulticastProbe)
-			e.job.Schedule(config.RetransmitTimer)
-		}
-
-		sendMulticastProbe()
+		panic(fmt.Sprintf("should never transition to Incomplete with setStateLocked; neigh = %#v, prev state = %s", e.neigh, prev))
 
 	case Reachable:
 		e.job = e.nic.stack.newJob(&e.mu, func() {
-			e.dispatchChangeEventLocked(Stale)
 			e.setStateLocked(Stale)
+			e.dispatchChangeEventLocked()
 		})
 		e.job.Schedule(e.nudState.ReachableTime())
 
 	case Delay:
 		e.job = e.nic.stack.newJob(&e.mu, func() {
-			e.dispatchChangeEventLocked(Probe)
 			e.setStateLocked(Probe)
+			e.dispatchChangeEventLocked()
 		})
 		e.job.Schedule(config.DelayFirstProbeTime)
 
@@ -277,19 +232,13 @@ func (e *neighborEntry) setStateLocked(next NeighborState) {
 				return
 			}
 
-			if err := e.linkRes.LinkAddressRequest(e.neigh.Addr, e.neigh.LocalAddr, e.neigh.LinkAddr, e.nic.linkEP); err != nil {
+			if err := e.linkRes.LinkAddressRequest(e.neigh.Addr, "" /* localAddr */, e.neigh.LinkAddr, e.nic); err != nil {
 				e.dispatchRemoveEventLocked()
 				e.setStateLocked(Failed)
 				return
 			}
 
 			retryCounter++
-			if retryCounter == config.MaxUnicastProbes {
-				e.dispatchRemoveEventLocked()
-				e.setStateLocked(Failed)
-				return
-			}
-
 			e.job = e.nic.stack.newJob(&e.mu, sendUnicastProbe)
 			e.job.Schedule(config.RetransmitTimer)
 		}
@@ -315,15 +264,72 @@ func (e *neighborEntry) setStateLocked(next NeighborState) {
 // being queued for outgoing transmission.
 //
 // Follows the logic defined in RFC 4861 section 7.3.3.
-func (e *neighborEntry) handlePacketQueuedLocked() {
+func (e *neighborEntry) handlePacketQueuedLocked(localAddr tcpip.Address) {
 	switch e.neigh.State {
 	case Unknown:
-		e.dispatchAddEventLocked(Incomplete)
-		e.setStateLocked(Incomplete)
+		e.neigh.State = Incomplete
+		e.neigh.UpdatedAt = time.Now()
+
+		e.dispatchAddEventLocked()
+
+		config := e.nudState.Config()
+
+		var retryCounter uint32
+		var sendMulticastProbe func()
+
+		sendMulticastProbe = func() {
+			if retryCounter == config.MaxMulticastProbes {
+				// "If no Neighbor Advertisement is received after
+				// MAX_MULTICAST_SOLICIT solicitations, address resolution has failed.
+				// The sender MUST return ICMP destination unreachable indications with
+				// code 3 (Address Unreachable) for each packet queued awaiting address
+				// resolution." - RFC 4861 section 7.2.2
+				//
+				// There is no need to send an ICMP destination unreachable indication
+				// since the failure to resolve the address is expected to only occur
+				// on this node. Thus, redirecting traffic is currently not supported.
+				//
+				// "If the error occurs on a node other than the node originating the
+				// packet, an ICMP error message is generated. If the error occurs on
+				// the originating node, an implementation is not required to actually
+				// create and send an ICMP error packet to the source, as long as the
+				// upper-layer sender is notified through an appropriate mechanism
+				// (e.g. return value from a procedure call). Note, however, that an
+				// implementation may find it convenient in some cases to return errors
+				// to the sender by taking the offending packet, generating an ICMP
+				// error message, and then delivering it (locally) through the generic
+				// error-handling routines.' - RFC 4861 section 2.1
+				e.dispatchRemoveEventLocked()
+				e.setStateLocked(Failed)
+				return
+			}
+
+			// As per RFC 4861 section 7.2.2:
+			//
+			//  If the source address of the packet prompting the solicitation is the
+			//  same as one of the addresses assigned to the outgoing interface, that
+			//  address SHOULD be placed in the IP Source Address of the outgoing
+			//  solicitation.
+			//
+			if err := e.linkRes.LinkAddressRequest(e.neigh.Addr, localAddr, "", e.nic); err != nil {
+				// There is no need to log the error here; the NUD implementation may
+				// assume a working link. A valid link should be the responsibility of
+				// the NIC/stack.LinkEndpoint.
+				e.dispatchRemoveEventLocked()
+				e.setStateLocked(Failed)
+				return
+			}
+
+			retryCounter++
+			e.job = e.nic.stack.newJob(&e.mu, sendMulticastProbe)
+			e.job.Schedule(config.RetransmitTimer)
+		}
+
+		sendMulticastProbe()
 
 	case Stale:
-		e.dispatchChangeEventLocked(Delay)
 		e.setStateLocked(Delay)
+		e.dispatchChangeEventLocked()
 
 	case Incomplete, Reachable, Delay, Probe, Static, Failed:
 		// Do nothing
@@ -345,21 +351,21 @@ func (e *neighborEntry) handleProbeLocked(remoteLinkAddr tcpip.LinkAddress) {
 	switch e.neigh.State {
 	case Unknown, Incomplete, Failed:
 		e.neigh.LinkAddr = remoteLinkAddr
-		e.dispatchAddEventLocked(Stale)
 		e.setStateLocked(Stale)
 		e.notifyWakersLocked()
+		e.dispatchAddEventLocked()
 
 	case Reachable, Delay, Probe:
 		if e.neigh.LinkAddr != remoteLinkAddr {
 			e.neigh.LinkAddr = remoteLinkAddr
-			e.dispatchChangeEventLocked(Stale)
 			e.setStateLocked(Stale)
+			e.dispatchChangeEventLocked()
 		}
 
 	case Stale:
 		if e.neigh.LinkAddr != remoteLinkAddr {
 			e.neigh.LinkAddr = remoteLinkAddr
-			e.dispatchChangeEventLocked(Stale)
+			e.dispatchChangeEventLocked()
 		}
 
 	case Static:
@@ -393,12 +399,11 @@ func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, fla
 
 		e.neigh.LinkAddr = linkAddr
 		if flags.Solicited {
-			e.dispatchChangeEventLocked(Reachable)
 			e.setStateLocked(Reachable)
 		} else {
-			e.dispatchChangeEventLocked(Stale)
 			e.setStateLocked(Stale)
 		}
+		e.dispatchChangeEventLocked()
 		e.isRouter = flags.IsRouter
 		e.notifyWakersLocked()
 
@@ -406,13 +411,13 @@ func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, fla
 		// INCOMPLETE state." - RFC 4861 section 7.2.5
 
 	case Reachable, Stale, Delay, Probe:
-		sameLinkAddr := e.neigh.LinkAddr == linkAddr
+		isLinkAddrDifferent := len(linkAddr) != 0 && e.neigh.LinkAddr != linkAddr
 
-		if !sameLinkAddr {
+		if isLinkAddrDifferent {
 			if !flags.Override {
 				if e.neigh.State == Reachable {
-					e.dispatchChangeEventLocked(Stale)
 					e.setStateLocked(Stale)
+					e.dispatchChangeEventLocked()
 				}
 				break
 			}
@@ -421,26 +426,27 @@ func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, fla
 
 			if !flags.Solicited {
 				if e.neigh.State != Stale {
-					e.dispatchChangeEventLocked(Stale)
 					e.setStateLocked(Stale)
+					e.dispatchChangeEventLocked()
 				} else {
 					// Notify the LinkAddr change, even though NUD state hasn't changed.
-					e.dispatchChangeEventLocked(e.neigh.State)
+					e.dispatchChangeEventLocked()
 				}
 				break
 			}
 		}
 
-		if flags.Solicited && (flags.Override || sameLinkAddr) {
-			if e.neigh.State != Reachable {
-				e.dispatchChangeEventLocked(Reachable)
-			}
+		if flags.Solicited && (flags.Override || !isLinkAddrDifferent) {
+			wasReachable := e.neigh.State == Reachable
 			// Set state to Reachable again to refresh timers.
 			e.setStateLocked(Reachable)
 			e.notifyWakersLocked()
+			if !wasReachable {
+				e.dispatchChangeEventLocked()
+			}
 		}
 
-		if e.isRouter && !flags.IsRouter {
+		if e.isRouter && !flags.IsRouter && header.IsV6UnicastAddress(e.neigh.Addr) {
 			// "In those cases where the IsRouter flag changes from TRUE to FALSE as
 			// a result of this update, the node MUST remove that router from the
 			// Default Router List and update the Destination Cache entries for all
@@ -448,9 +454,17 @@ func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, fla
 			// 7.3.3.  This is needed to detect when a node that is used as a router
 			// stops forwarding packets due to being configured as a host."
 			//  - RFC 4861 section 7.2.5
-			e.nic.mu.Lock()
-			e.nic.mu.ndp.invalidateDefaultRouter(e.neigh.Addr)
-			e.nic.mu.Unlock()
+			//
+			// TODO(gvisor.dev/issue/4085): Remove the special casing we do for IPv6
+			// here.
+			ep, ok := e.nic.networkEndpoints[header.IPv6ProtocolNumber]
+			if !ok {
+				panic(fmt.Sprintf("have a neighbor entry for an IPv6 router but no IPv6 network endpoint"))
+			}
+
+			if ndpEP, ok := ep.(NDPEndpoint); ok {
+				ndpEP.InvalidateDefaultRouter(e.neigh.Addr)
+			}
 		}
 		e.isRouter = flags.IsRouter
 
@@ -467,11 +481,12 @@ func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, fla
 func (e *neighborEntry) handleUpperLevelConfirmationLocked() {
 	switch e.neigh.State {
 	case Reachable, Stale, Delay, Probe:
-		if e.neigh.State != Reachable {
-			e.dispatchChangeEventLocked(Reachable)
-			// Set state to Reachable again to refresh timers.
-		}
+		wasReachable := e.neigh.State == Reachable
+		// Set state to Reachable again to refresh timers.
 		e.setStateLocked(Reachable)
+		if !wasReachable {
+			e.dispatchChangeEventLocked()
+		}
 
 	case Unknown, Incomplete, Failed, Static:
 		// Do nothing
diff --git a/pkg/tcpip/stack/neighbor_entry_test.go b/pkg/tcpip/stack/neighbor_entry_test.go
index b769fb2fa..e8e0e571b 100644
--- a/pkg/tcpip/stack/neighbor_entry_test.go
+++ b/pkg/tcpip/stack/neighbor_entry_test.go
@@ -27,6 +27,8 @@ import (
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/faketime"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
 const (
@@ -50,19 +52,16 @@ const (
 // predict the time that an event will be dispatched.
 func eventDiffOpts() []cmp.Option {
 	return []cmp.Option{
-		cmpopts.IgnoreFields(testEntryEventInfo{}, "UpdatedAt"),
+		cmpopts.IgnoreFields(NeighborEntry{}, "UpdatedAt"),
 	}
 }
 
 // eventDiffOptsWithSort is like eventDiffOpts but also includes an option to
 // sort slices of events for cases where ordering must be ignored.
 func eventDiffOptsWithSort() []cmp.Option {
-	return []cmp.Option{
-		cmpopts.IgnoreFields(testEntryEventInfo{}, "UpdatedAt"),
-		cmpopts.SortSlices(func(a, b testEntryEventInfo) bool {
-			return strings.Compare(string(a.Addr), string(b.Addr)) < 0
-		}),
-	}
+	return append(eventDiffOpts(), cmpopts.SortSlices(func(a, b testEntryEventInfo) bool {
+		return strings.Compare(string(a.Entry.Addr), string(b.Entry.Addr)) < 0
+	}))
 }
 
 // The following unit tests exercise every state transition and verify its
@@ -81,15 +80,18 @@ func eventDiffOptsWithSort() []cmp.Option {
 // | Reachable  | Stale      | Reachable timer expired                    |                 | Changed |
 // | Reachable  | Stale      | Probe or confirmation w/ different address |                 | Changed |
 // | Stale      | Reachable  | Solicited override confirmation            | Update LinkAddr | Changed |
+// | Stale      | Reachable  | Solicited confirmation w/o address         | Notify wakers   | Changed |
 // | Stale      | Stale      | Override confirmation                      | Update LinkAddr | Changed |
 // | Stale      | Stale      | Probe w/ different address                 | Update LinkAddr | Changed |
 // | Stale      | Delay      | Packet sent                                |                 | Changed |
 // | Delay      | Reachable  | Upper-layer confirmation                   |                 | Changed |
 // | Delay      | Reachable  | Solicited override confirmation            | Update LinkAddr | Changed |
+// | Delay      | Reachable  | Solicited confirmation w/o address         | Notify wakers   | Changed |
 // | Delay      | Stale      | Probe or confirmation w/ different address |                 | Changed |
 // | Delay      | Probe      | Delay timer expired                        | Send probe      | Changed |
 // | Probe      | Reachable  | Solicited override confirmation            | Update LinkAddr | Changed |
 // | Probe      | Reachable  | Solicited confirmation w/ same address     | Notify wakers   | Changed |
+// | Probe      | Reachable  | Solicited confirmation w/o address         | Notify wakers   | Changed |
 // | Probe      | Stale      | Probe or confirmation w/ different address |                 | Changed |
 // | Probe      | Probe      | Retransmit timer expired                   | Send probe      | Changed |
 // | Probe      | Failed     | Max probes sent without reply              | Notify wakers   | Removed |
@@ -120,14 +122,11 @@ func (t testEntryEventType) String() string {
 type testEntryEventInfo struct {
 	EventType testEntryEventType
 	NICID     tcpip.NICID
-	Addr      tcpip.Address
-	LinkAddr  tcpip.LinkAddress
-	State     NeighborState
-	UpdatedAt time.Time
+	Entry     NeighborEntry
 }
 
 func (e testEntryEventInfo) String() string {
-	return fmt.Sprintf("%s event for NIC #%d, addr=%q, linkAddr=%q, state=%q", e.EventType, e.NICID, e.Addr, e.LinkAddr, e.State)
+	return fmt.Sprintf("%s event for NIC #%d, %#v", e.EventType, e.NICID, e.Entry)
 }
 
 // testNUDDispatcher implements NUDDispatcher to validate the dispatching of
@@ -145,36 +144,27 @@ func (d *testNUDDispatcher) queueEvent(e testEntryEventInfo) {
 	d.events = append(d.events, e)
 }
 
-func (d *testNUDDispatcher) OnNeighborAdded(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) {
+func (d *testNUDDispatcher) OnNeighborAdded(nicID tcpip.NICID, entry NeighborEntry) {
 	d.queueEvent(testEntryEventInfo{
 		EventType: entryTestAdded,
 		NICID:     nicID,
-		Addr:      addr,
-		LinkAddr:  linkAddr,
-		State:     state,
-		UpdatedAt: updatedAt,
+		Entry:     entry,
 	})
 }
 
-func (d *testNUDDispatcher) OnNeighborChanged(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) {
+func (d *testNUDDispatcher) OnNeighborChanged(nicID tcpip.NICID, entry NeighborEntry) {
 	d.queueEvent(testEntryEventInfo{
 		EventType: entryTestChanged,
 		NICID:     nicID,
-		Addr:      addr,
-		LinkAddr:  linkAddr,
-		State:     state,
-		UpdatedAt: updatedAt,
+		Entry:     entry,
 	})
 }
 
-func (d *testNUDDispatcher) OnNeighborRemoved(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) {
+func (d *testNUDDispatcher) OnNeighborRemoved(nicID tcpip.NICID, entry NeighborEntry) {
 	d.queueEvent(testEntryEventInfo{
 		EventType: entryTestRemoved,
 		NICID:     nicID,
-		Addr:      addr,
-		LinkAddr:  linkAddr,
-		State:     state,
-		UpdatedAt: updatedAt,
+		Entry:     entry,
 	})
 }
 
@@ -197,9 +187,9 @@ func (p entryTestProbeInfo) String() string {
 
 // LinkAddressRequest sends a request for the LinkAddress of addr. Broadcasts
 // to the local network if linkAddr is the zero value.
-func (r *entryTestLinkResolver) LinkAddressRequest(addr, localAddr tcpip.Address, linkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error {
+func (r *entryTestLinkResolver) LinkAddressRequest(targetAddr, localAddr tcpip.Address, linkAddr tcpip.LinkAddress, _ NetworkInterface) *tcpip.Error {
 	p := entryTestProbeInfo{
-		RemoteAddress:     addr,
+		RemoteAddress:     targetAddr,
 		RemoteLinkAddress: linkAddr,
 		LocalAddress:      localAddr,
 	}
@@ -221,28 +211,26 @@ func (r *entryTestLinkResolver) LinkAddressProtocol() tcpip.NetworkProtocolNumbe
 	return entryTestNetNumber
 }
 
-func entryTestSetup(c NUDConfigurations) (*neighborEntry, *testNUDDispatcher, *entryTestLinkResolver, *fakeClock) {
-	clock := newFakeClock()
+func entryTestSetup(c NUDConfigurations) (*neighborEntry, *testNUDDispatcher, *entryTestLinkResolver, *faketime.ManualClock) {
+	clock := faketime.NewManualClock()
 	disp := testNUDDispatcher{}
 	nic := NIC{
-		id:     entryTestNICID,
-		linkEP: nil, // entryTestLinkResolver doesn't use a LinkEndpoint
+		LinkEndpoint: nil, // entryTestLinkResolver doesn't use a LinkEndpoint
+
+		id: entryTestNICID,
 		stack: &Stack{
 			clock:   clock,
 			nudDisp: &disp,
 		},
 	}
+	nic.networkEndpoints = map[tcpip.NetworkProtocolNumber]NetworkEndpoint{
+		header.IPv6ProtocolNumber: (&testIPv6Protocol{}).NewEndpoint(&nic, nil, nil, nil),
+	}
 
 	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
 	nudState := NewNUDState(c, rng)
 	linkRes := entryTestLinkResolver{}
-	entry := newNeighborEntry(&nic, entryTestAddr1 /* remoteAddr */, entryTestAddr2 /* localAddr */, nudState, &linkRes)
-
-	// Stub out ndpState to verify modification of default routers.
-	nic.mu.ndp = ndpState{
-		nic:            &nic,
-		defaultRouters: make(map[tcpip.Address]defaultRouterState),
-	}
+	entry := newNeighborEntry(&nic, entryTestAddr1 /* remoteAddr */, nudState, &linkRes)
 
 	// Stub out the neighbor cache to verify deletion from the cache.
 	nic.neigh = &neighborCache{
@@ -267,7 +255,7 @@ func TestEntryInitiallyUnknown(t *testing.T) {
 	}
 	e.mu.Unlock()
 
-	clock.advance(c.RetransmitTimer)
+	clock.Advance(c.RetransmitTimer)
 
 	// No probes should have been sent.
 	linkRes.mu.Lock()
@@ -300,7 +288,7 @@ func TestEntryUnknownToUnknownWhenConfirmationWithUnknownAddress(t *testing.T) {
 	}
 	e.mu.Unlock()
 
-	clock.advance(time.Hour)
+	clock.Advance(time.Hour)
 
 	// No probes should have been sent.
 	linkRes.mu.Lock()
@@ -323,7 +311,7 @@ func TestEntryUnknownToIncomplete(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got, want := e.neigh.State, Incomplete; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
@@ -347,9 +335,11 @@ func TestEntryUnknownToIncomplete(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 	}
 	{
@@ -385,9 +375,11 @@ func TestEntryUnknownToStale(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -403,14 +395,14 @@ func TestEntryIncompleteToIncompleteDoesNotChangeUpdatedAt(t *testing.T) {
 	e, nudDisp, linkRes, clock := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got, want := e.neigh.State, Incomplete; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
 	updatedAt := e.neigh.UpdatedAt
 	e.mu.Unlock()
 
-	clock.advance(c.RetransmitTimer)
+	clock.Advance(c.RetransmitTimer)
 
 	// UpdatedAt should remain the same during address resolution.
 	wantProbes := []entryTestProbeInfo{
@@ -439,7 +431,7 @@ func TestEntryIncompleteToIncompleteDoesNotChangeUpdatedAt(t *testing.T) {
 	}
 	e.mu.Unlock()
 
-	clock.advance(c.RetransmitTimer)
+	clock.Advance(c.RetransmitTimer)
 
 	// UpdatedAt should change after failing address resolution. Timing out after
 	// sending the last probe transitions the entry to Failed.
@@ -459,22 +451,26 @@ func TestEntryIncompleteToIncompleteDoesNotChangeUpdatedAt(t *testing.T) {
 		}
 	}
 
-	clock.advance(c.RetransmitTimer)
+	clock.Advance(c.RetransmitTimer)
 
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestRemoved,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -495,7 +491,7 @@ func TestEntryIncompleteToReachable(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got, want := e.neigh.State, Incomplete; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
@@ -527,16 +523,20 @@ func TestEntryIncompleteToReachable(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -560,7 +560,7 @@ func TestEntryAddsAndClearsWakers(t *testing.T) {
 	defer s.Done()
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got := e.wakers; got != nil {
 		t.Errorf("got e.wakers = %v, want = nil", got)
 	}
@@ -602,16 +602,20 @@ func TestEntryAddsAndClearsWakers(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -626,7 +630,7 @@ func TestEntryIncompleteToReachableWithRouterFlag(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got, want := e.neigh.State, Incomplete; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
@@ -660,16 +664,20 @@ func TestEntryIncompleteToReachableWithRouterFlag(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -684,7 +692,7 @@ func TestEntryIncompleteToStale(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got, want := e.neigh.State, Incomplete; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
@@ -716,16 +724,20 @@ func TestEntryIncompleteToStale(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -741,14 +753,14 @@ func TestEntryIncompleteToFailed(t *testing.T) {
 	e, nudDisp, linkRes, clock := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got, want := e.neigh.State, Incomplete; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
 	e.mu.Unlock()
 
 	waitFor := c.RetransmitTimer * time.Duration(c.MaxMulticastProbes)
-	clock.advance(waitFor)
+	clock.Advance(waitFor)
 
 	wantProbes := []entryTestProbeInfo{
 		// The Incomplete-to-Incomplete state transition is tested here by
@@ -780,16 +792,20 @@ func TestEntryIncompleteToFailed(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestRemoved,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -816,8 +832,10 @@ func TestEntryStaysReachableWhenConfirmationWithRouterFlag(t *testing.T) {
 	c := DefaultNUDConfigurations()
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
+	ipv6EP := e.nic.networkEndpoints[header.IPv6ProtocolNumber].(*testIPv6Endpoint)
+
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: true,
 		Override:  false,
@@ -829,9 +847,7 @@ func TestEntryStaysReachableWhenConfirmationWithRouterFlag(t *testing.T) {
 	if got, want := e.isRouter, true; got != want {
 		t.Errorf("got e.isRouter = %t, want = %t", got, want)
 	}
-	e.nic.mu.ndp.defaultRouters[entryTestAddr1] = defaultRouterState{
-		invalidationJob: e.nic.stack.newJob(&testLocker{}, func() {}),
-	}
+
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
@@ -840,8 +856,8 @@ func TestEntryStaysReachableWhenConfirmationWithRouterFlag(t *testing.T) {
 	if got, want := e.isRouter, false; got != want {
 		t.Errorf("got e.isRouter = %t, want = %t", got, want)
 	}
-	if _, ok := e.nic.mu.ndp.defaultRouters[entryTestAddr1]; ok {
-		t.Errorf("unexpected defaultRouter for %s", entryTestAddr1)
+	if ipv6EP.invalidatedRtr != e.neigh.Addr {
+		t.Errorf("got ipv6EP.invalidatedRtr = %s, want = %s", ipv6EP.invalidatedRtr, e.neigh.Addr)
 	}
 	e.mu.Unlock()
 
@@ -863,16 +879,20 @@ func TestEntryStaysReachableWhenConfirmationWithRouterFlag(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -893,7 +913,7 @@ func TestEntryStaysReachableWhenProbeWithSameAddress(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: true,
 		Override:  false,
@@ -929,16 +949,20 @@ func TestEntryStaysReachableWhenProbeWithSameAddress(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -958,7 +982,7 @@ func TestEntryReachableToStaleWhenTimeout(t *testing.T) {
 	e, nudDisp, linkRes, clock := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: true,
 		Override:  false,
@@ -983,29 +1007,35 @@ func TestEntryReachableToStaleWhenTimeout(t *testing.T) {
 		t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
 	}
 
-	clock.advance(c.BaseReachableTime)
+	clock.Advance(c.BaseReachableTime)
 
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1026,7 +1056,7 @@ func TestEntryReachableToStaleWhenProbeWithDifferentAddress(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: true,
 		Override:  false,
@@ -1059,23 +1089,29 @@ func TestEntryReachableToStaleWhenProbeWithDifferentAddress(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1096,7 +1132,7 @@ func TestEntryReachableToStaleWhenConfirmationWithDifferentAddress(t *testing.T)
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: true,
 		Override:  false,
@@ -1133,23 +1169,29 @@ func TestEntryReachableToStaleWhenConfirmationWithDifferentAddress(t *testing.T)
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1170,7 +1212,7 @@ func TestEntryReachableToStaleWhenConfirmationWithDifferentAddressAndOverride(t
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: true,
 		Override:  false,
@@ -1207,23 +1249,29 @@ func TestEntryReachableToStaleWhenConfirmationWithDifferentAddressAndOverride(t
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1244,7 +1292,7 @@ func TestEntryStaysStaleWhenProbeWithSameAddress(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
@@ -1280,16 +1328,20 @@ func TestEntryStaysStaleWhenProbeWithSameAddress(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1304,7 +1356,7 @@ func TestEntryStaleToReachableWhenSolicitedOverrideConfirmation(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
@@ -1344,23 +1396,106 @@ func TestEntryStaleToReachableWhenSolicitedOverrideConfirmation(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
+		},
+		{
+			EventType: entryTestChanged,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
+		},
+		{
+			EventType: entryTestChanged,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Reachable,
+			},
+		},
+	}
+	nudDisp.mu.Lock()
+	if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" {
+		t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
+	}
+	nudDisp.mu.Unlock()
+}
+
+func TestEntryStaleToReachableWhenSolicitedConfirmationWithoutAddress(t *testing.T) {
+	c := DefaultNUDConfigurations()
+	e, nudDisp, linkRes, _ := entryTestSetup(c)
+
+	e.mu.Lock()
+	e.handlePacketQueuedLocked(entryTestAddr2)
+	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
+		Solicited: false,
+		Override:  false,
+		IsRouter:  false,
+	})
+	if e.neigh.State != Stale {
+		t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Stale)
+	}
+	e.handleConfirmationLocked("" /* linkAddr */, ReachabilityConfirmationFlags{
+		Solicited: true,
+		Override:  false,
+		IsRouter:  false,
+	})
+	if e.neigh.State != Reachable {
+		t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Reachable)
+	}
+	if e.neigh.LinkAddr != entryTestLinkAddr1 {
+		t.Errorf("got e.neigh.LinkAddr = %q, want = %q", e.neigh.LinkAddr, entryTestLinkAddr1)
+	}
+	e.mu.Unlock()
+
+	wantProbes := []entryTestProbeInfo{
+		{
+			RemoteAddress:     entryTestAddr1,
+			RemoteLinkAddress: tcpip.LinkAddress(""),
+			LocalAddress:      entryTestAddr2,
+		},
+	}
+	linkRes.mu.Lock()
+	diff := cmp.Diff(linkRes.probes, wantProbes)
+	linkRes.mu.Unlock()
+	if diff != "" {
+		t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
+	}
+
+	wantEvents := []testEntryEventInfo{
+		{
+			EventType: entryTestAdded,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1375,7 +1510,7 @@ func TestEntryStaleToStaleWhenOverrideConfirmation(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
@@ -1415,23 +1550,29 @@ func TestEntryStaleToStaleWhenOverrideConfirmation(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1446,7 +1587,7 @@ func TestEntryStaleToStaleWhenProbeUpdateAddress(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
@@ -1482,23 +1623,29 @@ func TestEntryStaleToStaleWhenProbeUpdateAddress(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1513,7 +1660,7 @@ func TestEntryStaleToDelay(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
@@ -1522,7 +1669,7 @@ func TestEntryStaleToDelay(t *testing.T) {
 	if got, want := e.neigh.State, Stale; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got, want := e.neigh.State, Delay; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
@@ -1546,23 +1693,29 @@ func TestEntryStaleToDelay(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1582,13 +1735,13 @@ func TestEntryDelayToReachableWhenUpperLevelConfirmation(t *testing.T) {
 	e, nudDisp, linkRes, clock := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
 		IsRouter:  false,
 	})
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got, want := e.neigh.State, Delay; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
@@ -1612,43 +1765,53 @@ func TestEntryDelayToReachableWhenUpperLevelConfirmation(t *testing.T) {
 		t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
 	}
 
-	clock.advance(c.BaseReachableTime)
+	clock.Advance(c.BaseReachableTime)
 
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1669,13 +1832,13 @@ func TestEntryDelayToReachableWhenSolicitedOverrideConfirmation(t *testing.T) {
 	e, nudDisp, linkRes, clock := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
 		IsRouter:  false,
 	})
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got, want := e.neigh.State, Delay; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
@@ -1706,43 +1869,157 @@ func TestEntryDelayToReachableWhenSolicitedOverrideConfirmation(t *testing.T) {
 		t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
 	}
 
-	clock.advance(c.BaseReachableTime)
+	clock.Advance(c.BaseReachableTime)
+
+	wantEvents := []testEntryEventInfo{
+		{
+			EventType: entryTestAdded,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
+		},
+		{
+			EventType: entryTestChanged,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
+		},
+		{
+			EventType: entryTestChanged,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
+		},
+		{
+			EventType: entryTestChanged,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Reachable,
+			},
+		},
+		{
+			EventType: entryTestChanged,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Stale,
+			},
+		},
+	}
+	nudDisp.mu.Lock()
+	if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" {
+		t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
+	}
+	nudDisp.mu.Unlock()
+}
+
+func TestEntryDelayToReachableWhenSolicitedConfirmationWithoutAddress(t *testing.T) {
+	c := DefaultNUDConfigurations()
+	c.MaxMulticastProbes = 1
+	// Eliminate random factors from ReachableTime computation so the transition
+	// from Stale to Reachable will only take BaseReachableTime duration.
+	c.MinRandomFactor = 1
+	c.MaxRandomFactor = 1
+
+	e, nudDisp, linkRes, clock := entryTestSetup(c)
+
+	e.mu.Lock()
+	e.handlePacketQueuedLocked(entryTestAddr2)
+	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
+		Solicited: false,
+		Override:  false,
+		IsRouter:  false,
+	})
+	e.handlePacketQueuedLocked(entryTestAddr2)
+	if e.neigh.State != Delay {
+		t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Delay)
+	}
+	e.handleConfirmationLocked("" /* linkAddr */, ReachabilityConfirmationFlags{
+		Solicited: true,
+		Override:  false,
+		IsRouter:  false,
+	})
+	if e.neigh.State != Reachable {
+		t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Reachable)
+	}
+	if e.neigh.LinkAddr != entryTestLinkAddr1 {
+		t.Errorf("got e.neigh.LinkAddr = %q, want = %q", e.neigh.LinkAddr, entryTestLinkAddr1)
+	}
+	e.mu.Unlock()
+
+	wantProbes := []entryTestProbeInfo{
+		{
+			RemoteAddress:     entryTestAddr1,
+			RemoteLinkAddress: tcpip.LinkAddress(""),
+			LocalAddress:      entryTestAddr2,
+		},
+	}
+	linkRes.mu.Lock()
+	diff := cmp.Diff(linkRes.probes, wantProbes)
+	linkRes.mu.Unlock()
+	if diff != "" {
+		t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
+	}
+
+	clock.Advance(c.BaseReachableTime)
 
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1757,13 +2034,13 @@ func TestEntryStaysDelayWhenOverrideConfirmationWithSameAddress(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
 		IsRouter:  false,
 	})
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got, want := e.neigh.State, Delay; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
@@ -1798,23 +2075,29 @@ func TestEntryStaysDelayWhenOverrideConfirmationWithSameAddress(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1829,13 +2112,13 @@ func TestEntryDelayToStaleWhenProbeWithDifferentAddress(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
 		IsRouter:  false,
 	})
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got, want := e.neigh.State, Delay; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
@@ -1863,30 +2146,38 @@ func TestEntryDelayToStaleWhenProbeWithDifferentAddress(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1901,13 +2192,13 @@ func TestEntryDelayToStaleWhenConfirmationWithDifferentAddress(t *testing.T) {
 	e, nudDisp, linkRes, _ := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
 		IsRouter:  false,
 	})
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got, want := e.neigh.State, Delay; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
@@ -1939,30 +2230,38 @@ func TestEntryDelayToStaleWhenConfirmationWithDifferentAddress(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -1977,19 +2276,19 @@ func TestEntryDelayToProbe(t *testing.T) {
 	e, nudDisp, linkRes, clock := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
 		IsRouter:  false,
 	})
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	if got, want := e.neigh.State, Delay; got != want {
 		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
 	}
 	e.mu.Unlock()
 
-	clock.advance(c.DelayFirstProbeTime)
+	clock.Advance(c.DelayFirstProbeTime)
 
 	wantProbes := []entryTestProbeInfo{
 		// The first probe is caused by the Unknown-to-Incomplete transition.
@@ -2002,7 +2301,6 @@ func TestEntryDelayToProbe(t *testing.T) {
 		{
 			RemoteAddress:     entryTestAddr1,
 			RemoteLinkAddress: entryTestLinkAddr1,
-			LocalAddress:      entryTestAddr2,
 		},
 	}
 	linkRes.mu.Lock()
@@ -2016,30 +2314,38 @@ func TestEntryDelayToProbe(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Probe,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -2060,16 +2366,16 @@ func TestEntryProbeToStaleWhenProbeWithDifferentAddress(t *testing.T) {
 	e, nudDisp, linkRes, clock := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
 		IsRouter:  false,
 	})
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.mu.Unlock()
 
-	clock.advance(c.DelayFirstProbeTime)
+	clock.Advance(c.DelayFirstProbeTime)
 
 	wantProbes := []entryTestProbeInfo{
 		// The first probe is caused by the Unknown-to-Incomplete transition.
@@ -2082,7 +2388,6 @@ func TestEntryProbeToStaleWhenProbeWithDifferentAddress(t *testing.T) {
 		{
 			RemoteAddress:     entryTestAddr1,
 			RemoteLinkAddress: entryTestLinkAddr1,
-			LocalAddress:      entryTestAddr2,
 		},
 	}
 	linkRes.mu.Lock()
@@ -2106,37 +2411,47 @@ func TestEntryProbeToStaleWhenProbeWithDifferentAddress(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Probe,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -2157,16 +2472,16 @@ func TestEntryProbeToStaleWhenConfirmationWithDifferentAddress(t *testing.T) {
 	e, nudDisp, linkRes, clock := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
 		IsRouter:  false,
 	})
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.mu.Unlock()
 
-	clock.advance(c.DelayFirstProbeTime)
+	clock.Advance(c.DelayFirstProbeTime)
 
 	wantProbes := []entryTestProbeInfo{
 		// The first probe is caused by the Unknown-to-Incomplete transition.
@@ -2179,7 +2494,6 @@ func TestEntryProbeToStaleWhenConfirmationWithDifferentAddress(t *testing.T) {
 		{
 			RemoteAddress:     entryTestAddr1,
 			RemoteLinkAddress: entryTestLinkAddr1,
-			LocalAddress:      entryTestAddr2,
 		},
 	}
 	linkRes.mu.Lock()
@@ -2207,37 +2521,47 @@ func TestEntryProbeToStaleWhenConfirmationWithDifferentAddress(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Probe,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -2258,16 +2582,16 @@ func TestEntryStaysProbeWhenOverrideConfirmationWithSameAddress(t *testing.T) {
 	e, nudDisp, linkRes, clock := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
 		IsRouter:  false,
 	})
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.mu.Unlock()
 
-	clock.advance(c.DelayFirstProbeTime)
+	clock.Advance(c.DelayFirstProbeTime)
 
 	wantProbes := []entryTestProbeInfo{
 		// The first probe is caused by the Unknown-to-Incomplete transition.
@@ -2280,7 +2604,6 @@ func TestEntryStaysProbeWhenOverrideConfirmationWithSameAddress(t *testing.T) {
 		{
 			RemoteAddress:     entryTestAddr1,
 			RemoteLinkAddress: entryTestLinkAddr1,
-			LocalAddress:      entryTestAddr2,
 		},
 	}
 	linkRes.mu.Lock()
@@ -2311,30 +2634,38 @@ func TestEntryStaysProbeWhenOverrideConfirmationWithSameAddress(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Probe,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -2361,17 +2692,16 @@ func TestEntryUnknownToStaleToProbeToReachable(t *testing.T) {
 
 	e.mu.Lock()
 	e.handleProbeLocked(entryTestLinkAddr1)
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.mu.Unlock()
 
-	clock.advance(c.DelayFirstProbeTime)
+	clock.Advance(c.DelayFirstProbeTime)
 
 	wantProbes := []entryTestProbeInfo{
 		// Probe caused by the Delay-to-Probe transition
 		{
 			RemoteAddress:     entryTestAddr1,
 			RemoteLinkAddress: entryTestLinkAddr1,
-			LocalAddress:      entryTestAddr2,
 		},
 	}
 	linkRes.mu.Lock()
@@ -2398,43 +2728,53 @@ func TestEntryUnknownToStaleToProbeToReachable(t *testing.T) {
 	}
 	e.mu.Unlock()
 
-	clock.advance(c.BaseReachableTime)
+	clock.Advance(c.BaseReachableTime)
 
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Probe,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Reachable,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -2454,16 +2794,16 @@ func TestEntryProbeToReachableWhenSolicitedOverrideConfirmation(t *testing.T) {
 	e, nudDisp, linkRes, clock := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
 		IsRouter:  false,
 	})
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.mu.Unlock()
 
-	clock.advance(c.DelayFirstProbeTime)
+	clock.Advance(c.DelayFirstProbeTime)
 
 	wantProbes := []entryTestProbeInfo{
 		// The first probe is caused by the Unknown-to-Incomplete transition.
@@ -2476,7 +2816,6 @@ func TestEntryProbeToReachableWhenSolicitedOverrideConfirmation(t *testing.T) {
 		{
 			RemoteAddress:     entryTestAddr1,
 			RemoteLinkAddress: entryTestLinkAddr1,
-			LocalAddress:      entryTestAddr2,
 		},
 	}
 	linkRes.mu.Lock()
@@ -2503,50 +2842,62 @@ func TestEntryProbeToReachableWhenSolicitedOverrideConfirmation(t *testing.T) {
 	}
 	e.mu.Unlock()
 
-	clock.advance(c.BaseReachableTime)
+	clock.Advance(c.BaseReachableTime)
 
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Probe,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Reachable,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr2,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr2,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -2566,16 +2917,16 @@ func TestEntryProbeToReachableWhenSolicitedConfirmationWithSameAddress(t *testin
 	e, nudDisp, linkRes, clock := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
 		IsRouter:  false,
 	})
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.mu.Unlock()
 
-	clock.advance(c.DelayFirstProbeTime)
+	clock.Advance(c.DelayFirstProbeTime)
 
 	wantProbes := []entryTestProbeInfo{
 		// The first probe is caused by the Unknown-to-Incomplete transition.
@@ -2588,7 +2939,6 @@ func TestEntryProbeToReachableWhenSolicitedConfirmationWithSameAddress(t *testin
 		{
 			RemoteAddress:     entryTestAddr1,
 			RemoteLinkAddress: entryTestLinkAddr1,
-			LocalAddress:      entryTestAddr2,
 		},
 	}
 	linkRes.mu.Lock()
@@ -2612,50 +2962,62 @@ func TestEntryProbeToReachableWhenSolicitedConfirmationWithSameAddress(t *testin
 	}
 	e.mu.Unlock()
 
-	clock.advance(c.BaseReachableTime)
+	clock.Advance(c.BaseReachableTime)
 
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Probe,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Reachable,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -2665,24 +3027,26 @@ func TestEntryProbeToReachableWhenSolicitedConfirmationWithSameAddress(t *testin
 	nudDisp.mu.Unlock()
 }
 
-func TestEntryProbeToFailed(t *testing.T) {
+func TestEntryProbeToReachableWhenSolicitedConfirmationWithoutAddress(t *testing.T) {
 	c := DefaultNUDConfigurations()
-	c.MaxMulticastProbes = 3
-	c.MaxUnicastProbes = 3
+	// Eliminate random factors from ReachableTime computation so the transition
+	// from Stale to Reachable will only take BaseReachableTime duration.
+	c.MinRandomFactor = 1
+	c.MaxRandomFactor = 1
+
 	e, nudDisp, linkRes, clock := entryTestSetup(c)
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
 		IsRouter:  false,
 	})
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.mu.Unlock()
 
-	waitFor := c.DelayFirstProbeTime + c.RetransmitTimer*time.Duration(c.MaxUnicastProbes)
-	clock.advance(waitFor)
+	clock.Advance(c.DelayFirstProbeTime)
 
 	wantProbes := []entryTestProbeInfo{
 		// The first probe is caused by the Unknown-to-Incomplete transition.
@@ -2691,21 +3055,10 @@ func TestEntryProbeToFailed(t *testing.T) {
 			RemoteLinkAddress: tcpip.LinkAddress(""),
 			LocalAddress:      entryTestAddr2,
 		},
-		// The next three probe are caused by the Delay-to-Probe transition.
-		{
-			RemoteAddress:     entryTestAddr1,
-			RemoteLinkAddress: entryTestLinkAddr1,
-			LocalAddress:      entryTestAddr2,
-		},
-		{
-			RemoteAddress:     entryTestAddr1,
-			RemoteLinkAddress: entryTestLinkAddr1,
-			LocalAddress:      entryTestAddr2,
-		},
+		// The second probe is caused by the Delay-to-Probe transition.
 		{
 			RemoteAddress:     entryTestAddr1,
 			RemoteLinkAddress: entryTestLinkAddr1,
-			LocalAddress:      entryTestAddr2,
 		},
 	}
 	linkRes.mu.Lock()
@@ -2715,41 +3068,76 @@ func TestEntryProbeToFailed(t *testing.T) {
 		t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
 	}
 
+	e.mu.Lock()
+	if e.neigh.State != Probe {
+		t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Probe)
+	}
+	e.handleConfirmationLocked("" /* linkAddr */, ReachabilityConfirmationFlags{
+		Solicited: true,
+		Override:  false,
+		IsRouter:  false,
+	})
+	if e.neigh.State != Reachable {
+		t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Reachable)
+	}
+	e.mu.Unlock()
+
+	clock.Advance(c.BaseReachableTime)
+
 	wantEvents := []testEntryEventInfo{
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Probe,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
 		},
 		{
-			EventType: entryTestRemoved,
+			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Probe,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Reachable,
+			},
+		},
+		{
+			EventType: entryTestChanged,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
@@ -2757,12 +3145,129 @@ func TestEntryProbeToFailed(t *testing.T) {
 		t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
 	}
 	nudDisp.mu.Unlock()
+}
+
+func TestEntryProbeToFailed(t *testing.T) {
+	c := DefaultNUDConfigurations()
+	c.MaxMulticastProbes = 3
+	c.MaxUnicastProbes = 3
+	c.DelayFirstProbeTime = c.RetransmitTimer
+	e, nudDisp, linkRes, clock := entryTestSetup(c)
 
 	e.mu.Lock()
-	if got, want := e.neigh.State, Failed; got != want {
-		t.Errorf("got e.neigh.State = %q, want = %q", got, want)
+	e.handlePacketQueuedLocked(entryTestAddr2)
+	e.mu.Unlock()
+
+	{
+		wantProbes := []entryTestProbeInfo{
+			// Caused by the Unknown-to-Incomplete transition.
+			{
+				RemoteAddress: entryTestAddr1,
+				LocalAddress:  entryTestAddr2,
+			},
+		}
+		linkRes.mu.Lock()
+		diff := cmp.Diff(linkRes.probes, wantProbes)
+		linkRes.probes = nil
+		linkRes.mu.Unlock()
+		if diff != "" {
+			t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
+		}
+	}
+
+	e.mu.Lock()
+	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
+		Solicited: false,
+		Override:  false,
+		IsRouter:  false,
+	})
+	e.handlePacketQueuedLocked(entryTestAddr2)
+	e.mu.Unlock()
+
+	// Observe each probe sent while in the Probe state.
+	for i := uint32(0); i < c.MaxUnicastProbes; i++ {
+		clock.Advance(c.RetransmitTimer)
+		wantProbes := []entryTestProbeInfo{
+			{
+				RemoteAddress:     entryTestAddr1,
+				RemoteLinkAddress: entryTestLinkAddr1,
+			},
+		}
+		linkRes.mu.Lock()
+		diff := cmp.Diff(linkRes.probes, wantProbes)
+		linkRes.probes = nil
+		linkRes.mu.Unlock()
+		if diff != "" {
+			t.Fatalf("link address resolver probe #%d mismatch (-got, +want):\n%s", i+1, diff)
+		}
+
+		e.mu.Lock()
+		if e.neigh.State != Probe {
+			t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Probe)
+		}
+		e.mu.Unlock()
+	}
+
+	// Wait for the last probe to expire, causing a transition to Failed.
+	clock.Advance(c.RetransmitTimer)
+	e.mu.Lock()
+	if e.neigh.State != Failed {
+		t.Errorf("got e.neigh.State = %q, want = %q", e.neigh.State, Failed)
 	}
 	e.mu.Unlock()
+
+	wantEvents := []testEntryEventInfo{
+		{
+			EventType: entryTestAdded,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
+		},
+		{
+			EventType: entryTestChanged,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
+		},
+		{
+			EventType: entryTestChanged,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
+		},
+		{
+			EventType: entryTestChanged,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
+		},
+		{
+			EventType: entryTestRemoved,
+			NICID:     entryTestNICID,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
+		},
+	}
+	nudDisp.mu.Lock()
+	if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" {
+		t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
+	}
+	nudDisp.mu.Unlock()
 }
 
 func TestEntryFailedGetsDeleted(t *testing.T) {
@@ -2777,17 +3282,17 @@ func TestEntryFailedGetsDeleted(t *testing.T) {
 	}
 
 	e.mu.Lock()
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
 		Solicited: false,
 		Override:  false,
 		IsRouter:  false,
 	})
-	e.handlePacketQueuedLocked()
+	e.handlePacketQueuedLocked(entryTestAddr2)
 	e.mu.Unlock()
 
 	waitFor := c.DelayFirstProbeTime + c.RetransmitTimer*time.Duration(c.MaxUnicastProbes) + c.UnreachableTime
-	clock.advance(waitFor)
+	clock.Advance(waitFor)
 
 	wantProbes := []entryTestProbeInfo{
 		// The first probe is caused by the Unknown-to-Incomplete transition.
@@ -2800,17 +3305,14 @@ func TestEntryFailedGetsDeleted(t *testing.T) {
 		{
 			RemoteAddress:     entryTestAddr1,
 			RemoteLinkAddress: entryTestLinkAddr1,
-			LocalAddress:      entryTestAddr2,
 		},
 		{
 			RemoteAddress:     entryTestAddr1,
 			RemoteLinkAddress: entryTestLinkAddr1,
-			LocalAddress:      entryTestAddr2,
 		},
 		{
 			RemoteAddress:     entryTestAddr1,
 			RemoteLinkAddress: entryTestLinkAddr1,
-			LocalAddress:      entryTestAddr2,
 		},
 	}
 	linkRes.mu.Lock()
@@ -2824,37 +3326,47 @@ func TestEntryFailedGetsDeleted(t *testing.T) {
 		{
 			EventType: entryTestAdded,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  tcpip.LinkAddress(""),
-			State:     Incomplete,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: tcpip.LinkAddress(""),
+				State:    Incomplete,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Stale,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Stale,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Delay,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Delay,
+			},
 		},
 		{
 			EventType: entryTestChanged,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Probe,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
 		},
 		{
 			EventType: entryTestRemoved,
 			NICID:     entryTestNICID,
-			Addr:      entryTestAddr1,
-			LinkAddr:  entryTestLinkAddr1,
-			State:     Probe,
+			Entry: NeighborEntry{
+				Addr:     entryTestAddr1,
+				LinkAddr: entryTestLinkAddr1,
+				State:    Probe,
+			},
 		},
 	}
 	nudDisp.mu.Lock()
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 728292782..b97f4380a 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -18,48 +18,45 @@ import (
 	"fmt"
 	"math/rand"
 	"reflect"
-	"sort"
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
-var ipv4BroadcastAddr = tcpip.ProtocolAddress{
-	Protocol: header.IPv4ProtocolNumber,
-	AddressWithPrefix: tcpip.AddressWithPrefix{
-		Address:   header.IPv4Broadcast,
-		PrefixLen: 8 * header.IPv4AddressSize,
-	},
-}
+var _ NetworkInterface = (*NIC)(nil)
 
 // NIC represents a "network interface card" to which the networking stack is
 // attached.
 type NIC struct {
+	LinkEndpoint
+
 	stack   *Stack
 	id      tcpip.NICID
 	name    string
-	linkEP  LinkEndpoint
 	context NICContext
 
-	stats            NICStats
-	neigh            *neighborCache
+	stats NICStats
+	neigh *neighborCache
+
+	// The network endpoints themselves may be modified by calling the interface's
+	// methods, but the map reference and entries must be constant.
 	networkEndpoints map[tcpip.NetworkProtocolNumber]NetworkEndpoint
 
+	// enabled is set to 1 when the NIC is enabled and 0 when it is disabled.
+	//
+	// Must be accessed using atomic operations.
+	enabled uint32
+
 	mu struct {
 		sync.RWMutex
-		enabled     bool
 		spoofing    bool
 		promiscuous bool
-		primary     map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint
-		endpoints   map[NetworkEndpointID]*referencedNetworkEndpoint
-		mcastJoins  map[NetworkEndpointID]uint32
 		// packetEPs is protected by mu, but the contained PacketEndpoint
 		// values are not.
 		packetEPs map[tcpip.NetworkProtocolNumber][]PacketEndpoint
-		ndp       ndpState
 	}
 }
 
@@ -83,25 +80,6 @@ type DirectionStats struct {
 	Bytes   *tcpip.StatCounter
 }
 
-// PrimaryEndpointBehavior is an enumeration of an endpoint's primacy behavior.
-type PrimaryEndpointBehavior int
-
-const (
-	// CanBePrimaryEndpoint indicates the endpoint can be used as a primary
-	// endpoint for new connections with no local address. This is the
-	// default when calling NIC.AddAddress.
-	CanBePrimaryEndpoint PrimaryEndpointBehavior = iota
-
-	// FirstPrimaryEndpoint indicates the endpoint should be the first
-	// primary endpoint considered. If there are multiple endpoints with
-	// this behavior, the most recently-added one will be first.
-	FirstPrimaryEndpoint
-
-	// NeverPrimaryEndpoint indicates the endpoint should never be a
-	// primary endpoint.
-	NeverPrimaryEndpoint
-)
-
 // newNIC returns a new NIC using the default NDP configurations from stack.
 func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICContext) *NIC {
 	// TODO(b/141011931): Validate a LinkEndpoint (ep) is valid. For
@@ -113,76 +91,77 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 	// of IPv6 is supported on this endpoint's LinkEndpoint.
 
 	nic := &NIC{
+		LinkEndpoint: ep,
+
 		stack:            stack,
 		id:               id,
 		name:             name,
-		linkEP:           ep,
 		context:          ctx,
 		stats:            makeNICStats(),
 		networkEndpoints: make(map[tcpip.NetworkProtocolNumber]NetworkEndpoint),
 	}
-	nic.mu.primary = make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint)
-	nic.mu.endpoints = make(map[NetworkEndpointID]*referencedNetworkEndpoint)
-	nic.mu.mcastJoins = make(map[NetworkEndpointID]uint32)
 	nic.mu.packetEPs = make(map[tcpip.NetworkProtocolNumber][]PacketEndpoint)
-	nic.mu.ndp = ndpState{
-		nic:            nic,
-		configs:        stack.ndpConfigs,
-		dad:            make(map[tcpip.Address]dadState),
-		defaultRouters: make(map[tcpip.Address]defaultRouterState),
-		onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState),
-		slaacPrefixes:  make(map[tcpip.Subnet]slaacPrefixState),
-	}
-	nic.mu.ndp.initializeTempAddrState()
-
-	// Register supported packet endpoint protocols.
-	for _, netProto := range header.Ethertypes {
-		nic.mu.packetEPs[netProto] = []PacketEndpoint{}
-	}
-	for _, netProto := range stack.networkProtocols {
-		netNum := netProto.Number()
-		nic.mu.packetEPs[netNum] = nil
-		nic.networkEndpoints[netNum] = netProto.NewEndpoint(id, stack, nic, ep, stack)
-	}
 
 	// Check for Neighbor Unreachability Detection support.
-	if ep.Capabilities()&CapabilityResolutionRequired != 0 && len(stack.linkAddrResolvers) != 0 {
+	var nud NUDHandler
+	if ep.Capabilities()&CapabilityResolutionRequired != 0 && len(stack.linkAddrResolvers) != 0 && stack.useNeighborCache {
 		rng := rand.New(rand.NewSource(stack.clock.NowNanoseconds()))
 		nic.neigh = &neighborCache{
 			nic:   nic,
 			state: NewNUDState(stack.nudConfigs, rng),
 			cache: make(map[tcpip.Address]*neighborEntry, neighborCacheSize),
 		}
+
+		// An interface value that holds a nil pointer but non-nil type is not the
+		// same as the nil interface. Because of this, nud must only be assignd if
+		// nic.neigh is non-nil since a nil reference to a neighborCache is not
+		// valid.
+		//
+		// See https://golang.org/doc/faq#nil_error for more information.
+		nud = nic.neigh
 	}
 
-	nic.linkEP.Attach(nic)
+	// Register supported packet and network endpoint protocols.
+	for _, netProto := range header.Ethertypes {
+		nic.mu.packetEPs[netProto] = []PacketEndpoint{}
+	}
+	for _, netProto := range stack.networkProtocols {
+		netNum := netProto.Number()
+		nic.mu.packetEPs[netNum] = nil
+		nic.networkEndpoints[netNum] = netProto.NewEndpoint(nic, stack, nud, nic)
+	}
+
+	nic.LinkEndpoint.Attach(nic)
 
 	return nic
 }
 
-// enabled returns true if n is enabled.
-func (n *NIC) enabled() bool {
-	n.mu.RLock()
-	enabled := n.mu.enabled
-	n.mu.RUnlock()
-	return enabled
+func (n *NIC) getNetworkEndpoint(proto tcpip.NetworkProtocolNumber) NetworkEndpoint {
+	return n.networkEndpoints[proto]
 }
 
-// disable disables n.
+// Enabled implements NetworkInterface.
+func (n *NIC) Enabled() bool {
+	return atomic.LoadUint32(&n.enabled) == 1
+}
+
+// setEnabled sets the enabled status for the NIC.
 //
-// It undoes the work done by enable.
-func (n *NIC) disable() *tcpip.Error {
-	n.mu.RLock()
-	enabled := n.mu.enabled
-	n.mu.RUnlock()
-	if !enabled {
-		return nil
+// Returns true if the enabled status was updated.
+func (n *NIC) setEnabled(v bool) bool {
+	if v {
+		return atomic.SwapUint32(&n.enabled, 1) == 0
 	}
+	return atomic.SwapUint32(&n.enabled, 0) == 1
+}
 
+// disable disables n.
+//
+// It undoes the work done by enable.
+func (n *NIC) disable() {
 	n.mu.Lock()
-	err := n.disableLocked()
+	n.disableLocked()
 	n.mu.Unlock()
-	return err
 }
 
 // disableLocked disables n.
@@ -190,9 +169,9 @@ func (n *NIC) disable() *tcpip.Error {
 // It undoes the work done by enable.
 //
 // n MUST be locked.
-func (n *NIC) disableLocked() *tcpip.Error {
-	if !n.mu.enabled {
-		return nil
+func (n *NIC) disableLocked() {
+	if !n.setEnabled(false) {
+		return
 	}
 
 	// TODO(gvisor.dev/issue/1491): Should Routes that are currently bound to n be
@@ -200,38 +179,9 @@ func (n *NIC) disableLocked() *tcpip.Error {
 	// again, and applications may not know that the underlying NIC was ever
 	// disabled.
 
-	if _, ok := n.stack.networkProtocols[header.IPv6ProtocolNumber]; ok {
-		n.mu.ndp.stopSolicitingRouters()
-		n.mu.ndp.cleanupState(false /* hostOnly */)
-
-		// Stop DAD for all the unicast IPv6 endpoints that are in the
-		// permanentTentative state.
-		for _, r := range n.mu.endpoints {
-			if addr := r.address(); r.getKind() == permanentTentative && header.IsV6UnicastAddress(addr) {
-				n.mu.ndp.stopDuplicateAddressDetection(addr)
-			}
-		}
-
-		// The NIC may have already left the multicast group.
-		if err := n.leaveGroupLocked(header.IPv6AllNodesMulticastAddress, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress {
-			return err
-		}
-	}
-
-	if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok {
-		// The NIC may have already left the multicast group.
-		if err := n.leaveGroupLocked(header.IPv4AllSystems, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress {
-			return err
-		}
-
-		// The address may have already been removed.
-		if err := n.removePermanentAddressLocked(ipv4BroadcastAddr.AddressWithPrefix.Address); err != nil && err != tcpip.ErrBadLocalAddress {
-			return err
-		}
+	for _, ep := range n.networkEndpoints {
+		ep.Disable()
 	}
-
-	n.mu.enabled = false
-	return nil
 }
 
 // enable enables n.
@@ -241,162 +191,38 @@ func (n *NIC) disableLocked() *tcpip.Error {
 // routers if the stack is not operating as a router. If the stack is also
 // configured to auto-generate a link-local address, one will be generated.
 func (n *NIC) enable() *tcpip.Error {
-	n.mu.RLock()
-	enabled := n.mu.enabled
-	n.mu.RUnlock()
-	if enabled {
-		return nil
-	}
-
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	if n.mu.enabled {
+	if !n.setEnabled(true) {
 		return nil
 	}
 
-	n.mu.enabled = true
-
-	// Create an endpoint to receive broadcast packets on this interface.
-	if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok {
-		if _, err := n.addAddressLocked(ipv4BroadcastAddr, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
-			return err
-		}
-
-		// As per RFC 1122 section 3.3.7, all hosts should join the all-hosts
-		// multicast group. Note, the IANA calls the all-hosts multicast group the
-		// all-systems multicast group.
-		if err := n.joinGroupLocked(header.IPv4ProtocolNumber, header.IPv4AllSystems); err != nil {
-			return err
-		}
-	}
-
-	// Join the IPv6 All-Nodes Multicast group if the stack is configured to
-	// use IPv6. This is required to ensure that this node properly receives
-	// and responds to the various NDP messages that are destined to the
-	// all-nodes multicast address. An example is the Neighbor Advertisement
-	// when we perform Duplicate Address Detection, or Router Advertisement
-	// when we do Router Discovery. See RFC 4862, section 5.4.2 and RFC 4861
-	// section 4.2 for more information.
-	//
-	// Also auto-generate an IPv6 link-local address based on the NIC's
-	// link address if it is configured to do so. Note, each interface is
-	// required to have IPv6 link-local unicast address, as per RFC 4291
-	// section 2.1.
-	_, ok := n.stack.networkProtocols[header.IPv6ProtocolNumber]
-	if !ok {
-		return nil
-	}
-
-	// Join the All-Nodes multicast group before starting DAD as responses to DAD
-	// (NDP NS) messages may be sent to the All-Nodes multicast group if the
-	// source address of the NDP NS is the unspecified address, as per RFC 4861
-	// section 7.2.4.
-	if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil {
-		return err
-	}
-
-	// Perform DAD on the all the unicast IPv6 endpoints that are in the permanent
-	// state.
-	//
-	// Addresses may have aleady completed DAD but in the time since the NIC was
-	// last enabled, other devices may have acquired the same addresses.
-	for _, r := range n.mu.endpoints {
-		addr := r.address()
-		if k := r.getKind(); (k != permanent && k != permanentTentative) || !header.IsV6UnicastAddress(addr) {
-			continue
-		}
-
-		r.setKind(permanentTentative)
-		if err := n.mu.ndp.startDuplicateAddressDetection(addr, r); err != nil {
+	for _, ep := range n.networkEndpoints {
+		if err := ep.Enable(); err != nil {
 			return err
 		}
 	}
 
-	// Do not auto-generate an IPv6 link-local address for loopback devices.
-	if n.stack.autoGenIPv6LinkLocal && !n.isLoopback() {
-		// The valid and preferred lifetime is infinite for the auto-generated
-		// link-local address.
-		n.mu.ndp.doSLAAC(header.IPv6LinkLocalPrefix.Subnet(), header.NDPInfiniteLifetime, header.NDPInfiniteLifetime)
-	}
-
-	// If we are operating as a router, then do not solicit routers since we
-	// won't process the RAs anyways.
-	//
-	// Routers do not process Router Advertisements (RA) the same way a host
-	// does. That is, routers do not learn from RAs (e.g. on-link prefixes
-	// and default routers). Therefore, soliciting RAs from other routers on
-	// a link is unnecessary for routers.
-	if !n.stack.forwarding {
-		n.mu.ndp.startSolicitingRouters()
-	}
-
 	return nil
 }
 
-// remove detaches NIC from the link endpoint, and marks existing referenced
-// network endpoints expired. This guarantees no packets between this NIC and
-// the network stack.
+// remove detaches NIC from the link endpoint and releases network endpoint
+// resources. This guarantees no packets between this NIC and the network
+// stack.
 func (n *NIC) remove() *tcpip.Error {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
 	n.disableLocked()
 
-	// TODO(b/151378115): come up with a better way to pick an error than the
-	// first one.
-	var err *tcpip.Error
-
-	// Forcefully leave multicast groups.
-	for nid := range n.mu.mcastJoins {
-		if tempErr := n.leaveGroupLocked(nid.LocalAddress, true /* force */); tempErr != nil && err == nil {
-			err = tempErr
-		}
-	}
-
-	// Remove permanent and permanentTentative addresses, so no packet goes out.
-	for nid, ref := range n.mu.endpoints {
-		switch ref.getKind() {
-		case permanentTentative, permanent:
-			if tempErr := n.removePermanentAddressLocked(nid.LocalAddress); tempErr != nil && err == nil {
-				err = tempErr
-			}
-		}
-	}
-
-	// Release any resources the network endpoint may hold.
 	for _, ep := range n.networkEndpoints {
 		ep.Close()
 	}
 
 	// Detach from link endpoint, so no packet comes in.
-	n.linkEP.Attach(nil)
-
-	return err
-}
-
-// becomeIPv6Router transitions n into an IPv6 router.
-//
-// When transitioning into an IPv6 router, host-only state (NDP discovered
-// routers, discovered on-link prefixes, and auto-generated addresses) will
-// be cleaned up/invalidated and NDP router solicitations will be stopped.
-func (n *NIC) becomeIPv6Router() {
-	n.mu.Lock()
-	defer n.mu.Unlock()
-
-	n.mu.ndp.cleanupState(true /* hostOnly */)
-	n.mu.ndp.stopSolicitingRouters()
-}
-
-// becomeIPv6Host transitions n into an IPv6 host.
-//
-// When transitioning into an IPv6 host, NDP router solicitations will be
-// started.
-func (n *NIC) becomeIPv6Host() {
-	n.mu.Lock()
-	defer n.mu.Unlock()
-
-	n.mu.ndp.startSolicitingRouters()
+	n.LinkEndpoint.Attach(nil)
+	return nil
 }
 
 // setPromiscuousMode enables or disables promiscuous mode.
@@ -413,217 +239,122 @@ func (n *NIC) isPromiscuousMode() bool {
 	return rv
 }
 
-func (n *NIC) isLoopback() bool {
-	return n.linkEP.Capabilities()&CapabilityLoopback != 0
-}
-
-// setSpoofing enables or disables address spoofing.
-func (n *NIC) setSpoofing(enable bool) {
-	n.mu.Lock()
-	n.mu.spoofing = enable
-	n.mu.Unlock()
+// IsLoopback implements NetworkInterface.
+func (n *NIC) IsLoopback() bool {
+	return n.LinkEndpoint.Capabilities()&CapabilityLoopback != 0
 }
 
-// primaryEndpoint will return the first non-deprecated endpoint if such an
-// endpoint exists for the given protocol and remoteAddr. If no non-deprecated
-// endpoint exists, the first deprecated endpoint will be returned.
-//
-// If an IPv6 primary endpoint is requested, Source Address Selection (as
-// defined by RFC 6724 section 5) will be performed.
-func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber, remoteAddr tcpip.Address) *referencedNetworkEndpoint {
-	if protocol == header.IPv6ProtocolNumber && remoteAddr != "" {
-		return n.primaryIPv6Endpoint(remoteAddr)
-	}
-
-	n.mu.RLock()
-	defer n.mu.RUnlock()
-
-	var deprecatedEndpoint *referencedNetworkEndpoint
-	for _, r := range n.mu.primary[protocol] {
-		if !r.isValidForOutgoingRLocked() {
-			continue
-		}
-
-		if !r.deprecated {
-			if r.tryIncRef() {
-				// r is not deprecated, so return it immediately.
-				//
-				// If we kept track of a deprecated endpoint, decrement its reference
-				// count since it was incremented when we decided to keep track of it.
-				if deprecatedEndpoint != nil {
-					deprecatedEndpoint.decRefLocked()
-					deprecatedEndpoint = nil
-				}
-
-				return r
-			}
-		} else if deprecatedEndpoint == nil && r.tryIncRef() {
-			// We prefer an endpoint that is not deprecated, but we keep track of r in
-			// case n doesn't have any non-deprecated endpoints.
-			//
-			// If we end up finding a more preferred endpoint, r's reference count
-			// will be decremented when such an endpoint is found.
-			deprecatedEndpoint = r
+// WritePacket implements NetworkLinkEndpoint.
+func (n *NIC) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error {
+	// As per relevant RFCs, we should queue packets while we wait for link
+	// resolution to complete.
+	//
+	// RFC 1122 section 2.3.2.2 (for IPv4):
+	//   The link layer SHOULD save (rather than discard) at least
+	//   one (the latest) packet of each set of packets destined to
+	//   the same unresolved IP address, and transmit the saved
+	//   packet when the address has been resolved.
+	//
+	// RFC 4861 section 5.2 (for IPv6):
+	//   Once the IP address of the next-hop node is known, the sender
+	//   examines the Neighbor Cache for link-layer information about that
+	//   neighbor.  If no entry exists, the sender creates one, sets its state
+	//   to INCOMPLETE, initiates Address Resolution, and then queues the data
+	//   packet pending completion of address resolution.
+	if ch, err := r.Resolve(nil); err != nil {
+		if err == tcpip.ErrWouldBlock {
+			r := r.Clone()
+			n.stack.linkResQueue.enqueue(ch, &r, protocol, pkt)
+			return nil
 		}
+		return err
 	}
 
-	// n doesn't have any valid non-deprecated endpoints, so return
-	// deprecatedEndpoint (which may be nil if n doesn't have any valid deprecated
-	// endpoints either).
-	return deprecatedEndpoint
-}
-
-// ipv6AddrCandidate is an IPv6 candidate for Source Address Selection (RFC
-// 6724 section 5).
-type ipv6AddrCandidate struct {
-	ref   *referencedNetworkEndpoint
-	scope header.IPv6AddressScope
+	return n.writePacket(r, gso, protocol, pkt)
 }
 
-// primaryIPv6Endpoint returns an IPv6 endpoint following Source Address
-// Selection (RFC 6724 section 5).
-//
-// Note, only rules 1-3 and 7 are followed.
-//
-// remoteAddr must be a valid IPv6 address.
-func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEndpoint {
-	n.mu.RLock()
-	ref := n.primaryIPv6EndpointRLocked(remoteAddr)
-	n.mu.RUnlock()
-	return ref
+// WritePacketToRemote implements NetworkInterface.
+func (n *NIC) WritePacketToRemote(remoteLinkAddr tcpip.LinkAddress, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error {
+	r := Route{
+		NetProto:          protocol,
+		RemoteLinkAddress: remoteLinkAddr,
+	}
+	return n.writePacket(&r, gso, protocol, pkt)
 }
 
-// primaryIPv6EndpointLocked returns an IPv6 endpoint following Source Address
-// Selection (RFC 6724 section 5).
-//
-// Note, only rules 1-3 and 7 are followed.
-//
-// remoteAddr must be a valid IPv6 address.
-//
-// n.mu MUST be read locked.
-func (n *NIC) primaryIPv6EndpointRLocked(remoteAddr tcpip.Address) *referencedNetworkEndpoint {
-	primaryAddrs := n.mu.primary[header.IPv6ProtocolNumber]
+func (n *NIC) writePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error {
+	// WritePacket takes ownership of pkt, calculate numBytes first.
+	numBytes := pkt.Size()
 
-	if len(primaryAddrs) == 0 {
-		return nil
+	if err := n.LinkEndpoint.WritePacket(r, gso, protocol, pkt); err != nil {
+		return err
 	}
 
-	// Create a candidate set of available addresses we can potentially use as a
-	// source address.
-	cs := make([]ipv6AddrCandidate, 0, len(primaryAddrs))
-	for _, r := range primaryAddrs {
-		// If r is not valid for outgoing connections, it is not a valid endpoint.
-		if !r.isValidForOutgoingRLocked() {
-			continue
-		}
-
-		addr := r.address()
-		scope, err := header.ScopeForIPv6Address(addr)
-		if err != nil {
-			// Should never happen as we got r from the primary IPv6 endpoint list and
-			// ScopeForIPv6Address only returns an error if addr is not an IPv6
-			// address.
-			panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", addr, err))
-		}
-
-		cs = append(cs, ipv6AddrCandidate{
-			ref:   r,
-			scope: scope,
-		})
-	}
+	n.stats.Tx.Packets.Increment()
+	n.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
+	return nil
+}
 
-	remoteScope, err := header.ScopeForIPv6Address(remoteAddr)
-	if err != nil {
-		// primaryIPv6Endpoint should never be called with an invalid IPv6 address.
-		panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err))
+// WritePackets implements NetworkLinkEndpoint.
+func (n *NIC) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	// TODO(gvisor.dev/issue/4458): Queue packets whie link address resolution
+	// is being peformed like WritePacket.
+	writtenPackets, err := n.LinkEndpoint.WritePackets(r, gso, pkts, protocol)
+	n.stats.Tx.Packets.IncrementBy(uint64(writtenPackets))
+	writtenBytes := 0
+	for i, pb := 0, pkts.Front(); i < writtenPackets && pb != nil; i, pb = i+1, pb.Next() {
+		writtenBytes += pb.Size()
 	}
 
-	// Sort the addresses as per RFC 6724 section 5 rules 1-3.
-	//
-	// TODO(b/146021396): Implement rules 4-8 of RFC 6724 section 5.
-	sort.Slice(cs, func(i, j int) bool {
-		sa := cs[i]
-		sb := cs[j]
-
-		// Prefer same address as per RFC 6724 section 5 rule 1.
-		if sa.ref.address() == remoteAddr {
-			return true
-		}
-		if sb.ref.address() == remoteAddr {
-			return false
-		}
-
-		// Prefer appropriate scope as per RFC 6724 section 5 rule 2.
-		if sa.scope < sb.scope {
-			return sa.scope >= remoteScope
-		} else if sb.scope < sa.scope {
-			return sb.scope < remoteScope
-		}
-
-		// Avoid deprecated addresses as per RFC 6724 section 5 rule 3.
-		if saDep, sbDep := sa.ref.deprecated, sb.ref.deprecated; saDep != sbDep {
-			// If sa is not deprecated, it is preferred over sb.
-			return sbDep
-		}
-
-		// Prefer temporary addresses as per RFC 6724 section 5 rule 7.
-		if saTemp, sbTemp := sa.ref.configType == slaacTemp, sb.ref.configType == slaacTemp; saTemp != sbTemp {
-			return saTemp
-		}
-
-		// sa and sb are equal, return the endpoint that is closest to the front of
-		// the primary endpoint list.
-		return i < j
-	})
-
-	// Return the most preferred address that can have its reference count
-	// incremented.
-	for _, c := range cs {
-		if r := c.ref; r.tryIncRef() {
-			return r
-		}
-	}
+	n.stats.Tx.Bytes.IncrementBy(uint64(writtenBytes))
+	return writtenPackets, err
+}
 
-	return nil
+// setSpoofing enables or disables address spoofing.
+func (n *NIC) setSpoofing(enable bool) {
+	n.mu.Lock()
+	n.mu.spoofing = enable
+	n.mu.Unlock()
 }
 
-// hasPermanentAddrLocked returns true if n has a permanent (including currently
-// tentative) address, addr.
-func (n *NIC) hasPermanentAddrLocked(addr tcpip.Address) bool {
-	ref, ok := n.mu.endpoints[NetworkEndpointID{addr}]
+// primaryAddress returns an address that can be used to communicate with
+// remoteAddr.
+func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber, remoteAddr tcpip.Address) AssignableAddressEndpoint {
+	n.mu.RLock()
+	spoofing := n.mu.spoofing
+	n.mu.RUnlock()
 
+	ep, ok := n.networkEndpoints[protocol]
 	if !ok {
-		return false
+		return nil
 	}
 
-	kind := ref.getKind()
-
-	return kind == permanent || kind == permanentTentative
+	return ep.AcquireOutgoingPrimaryAddress(remoteAddr, spoofing)
 }
 
-type getRefBehaviour int
+type getAddressBehaviour int
 
 const (
 	// spoofing indicates that the NIC's spoofing flag should be observed when
-	// getting a NIC's referenced network endpoint.
-	spoofing getRefBehaviour = iota
+	// getting a NIC's address endpoint.
+	spoofing getAddressBehaviour = iota
 
 	// promiscuous indicates that the NIC's promiscuous flag should be observed
-	// when getting a NIC's referenced network endpoint.
+	// when getting a NIC's address endpoint.
 	promiscuous
 )
 
-func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *referencedNetworkEndpoint {
-	return n.getRefOrCreateTemp(protocol, dst, CanBePrimaryEndpoint, promiscuous)
+func (n *NIC) getAddress(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) AssignableAddressEndpoint {
+	return n.getAddressOrCreateTemp(protocol, dst, CanBePrimaryEndpoint, promiscuous)
 }
 
 // findEndpoint finds the endpoint, if any, with the given address.
-func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) *referencedNetworkEndpoint {
-	return n.getRefOrCreateTemp(protocol, address, peb, spoofing)
+func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) AssignableAddressEndpoint {
+	return n.getAddressOrCreateTemp(protocol, address, peb, spoofing)
 }
 
-// getRefEpOrCreateTemp returns the referenced network endpoint for the given
-// protocol and address.
+// getAddressEpOrCreateTemp returns the address endpoint for the given protocol
+// and address.
 //
 // If none exists a temporary one may be created if we are in promiscuous mode
 // or spoofing. Promiscuous mode will only be checked if promiscuous is true.
@@ -631,9 +362,8 @@ func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.A
 //
 // If the address is the IPv4 broadcast address for an endpoint's network, that
 // endpoint will be returned.
-func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior, tempRef getRefBehaviour) *referencedNetworkEndpoint {
+func (n *NIC) getAddressOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior, tempRef getAddressBehaviour) AssignableAddressEndpoint {
 	n.mu.RLock()
-
 	var spoofingOrPromiscuous bool
 	switch tempRef {
 	case spoofing:
@@ -641,282 +371,54 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 	case promiscuous:
 		spoofingOrPromiscuous = n.mu.promiscuous
 	}
-
-	if ref, ok := n.mu.endpoints[NetworkEndpointID{address}]; ok {
-		// An endpoint with this id exists, check if it can be used and return it.
-		if !ref.isAssignedRLocked(spoofingOrPromiscuous) {
-			n.mu.RUnlock()
-			return nil
-		}
-
-		if ref.tryIncRef() {
-			n.mu.RUnlock()
-			return ref
-		}
-	}
-
-	// Check if address is a broadcast address for the endpoint's network.
-	//
-	// Only IPv4 has a notion of broadcast addresses.
-	if protocol == header.IPv4ProtocolNumber {
-		if ref := n.getRefForBroadcastRLocked(address); ref != nil {
-			n.mu.RUnlock()
-			return ref
-		}
-	}
-
-	// A usable reference was not found, create a temporary one if requested by
-	// the caller or if the address is found in the NIC's subnets.
-	createTempEP := spoofingOrPromiscuous
 	n.mu.RUnlock()
-
-	if !createTempEP {
-		return nil
-	}
-
-	// Try again with the lock in exclusive mode. If we still can't get the
-	// endpoint, create a new "temporary" endpoint. It will only exist while
-	// there's a route through it.
-	n.mu.Lock()
-	ref := n.getRefOrCreateTempLocked(protocol, address, peb)
-	n.mu.Unlock()
-	return ref
+	return n.getAddressOrCreateTempInner(protocol, address, spoofingOrPromiscuous, peb)
 }
 
-// getRefForBroadcastLocked returns an endpoint where address is the IPv4
-// broadcast address for the endpoint's network.
-//
-// n.mu MUST be read locked.
-func (n *NIC) getRefForBroadcastRLocked(address tcpip.Address) *referencedNetworkEndpoint {
-	for _, ref := range n.mu.endpoints {
-		// Only IPv4 has a notion of broadcast addresses.
-		if ref.protocol != header.IPv4ProtocolNumber {
-			continue
-		}
-
-		addr := ref.addrWithPrefix()
-		subnet := addr.Subnet()
-		if subnet.IsBroadcast(address) && ref.tryIncRef() {
-			return ref
-		}
+// getAddressOrCreateTempInner is like getAddressEpOrCreateTemp except a boolean
+// is passed to indicate whether or not we should generate temporary endpoints.
+func (n *NIC) getAddressOrCreateTempInner(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, createTemp bool, peb PrimaryEndpointBehavior) AssignableAddressEndpoint {
+	if ep, ok := n.networkEndpoints[protocol]; ok {
+		return ep.AcquireAssignedAddress(address, createTemp, peb)
 	}
 
 	return nil
 }
 
-/// getRefOrCreateTempLocked returns an existing endpoint for address or creates
-/// and returns a temporary endpoint.
-//
-// If the address is the IPv4 broadcast address for an endpoint's network, that
-// endpoint will be returned.
-//
-// n.mu must be write locked.
-func (n *NIC) getRefOrCreateTempLocked(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) *referencedNetworkEndpoint {
-	if ref, ok := n.mu.endpoints[NetworkEndpointID{address}]; ok {
-		// No need to check the type as we are ok with expired endpoints at this
-		// point.
-		if ref.tryIncRef() {
-			return ref
-		}
-		// tryIncRef failing means the endpoint is scheduled to be removed once the
-		// lock is released. Remove it here so we can create a new (temporary) one.
-		// The removal logic waiting for the lock handles this case.
-		n.removeEndpointLocked(ref)
-	}
-
-	// Check if address is a broadcast address for an endpoint's network.
-	//
-	// Only IPv4 has a notion of broadcast addresses.
-	if protocol == header.IPv4ProtocolNumber {
-		if ref := n.getRefForBroadcastRLocked(address); ref != nil {
-			return ref
-		}
-	}
-
-	// Add a new temporary endpoint.
-	netProto, ok := n.stack.networkProtocols[protocol]
-	if !ok {
-		return nil
-	}
-	ref, _ := n.addAddressLocked(tcpip.ProtocolAddress{
-		Protocol: protocol,
-		AddressWithPrefix: tcpip.AddressWithPrefix{
-			Address:   address,
-			PrefixLen: netProto.DefaultPrefixLen(),
-		},
-	}, peb, temporary, static, false)
-	return ref
-}
-
-// addAddressLocked adds a new protocolAddress to n.
-//
-// If n already has the address in a non-permanent state, and the kind given is
-// permanent, that address will be promoted in place and its properties set to
-// the properties provided. Otherwise, it returns tcpip.ErrDuplicateAddress.
-func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind, configType networkEndpointConfigType, deprecated bool) (*referencedNetworkEndpoint, *tcpip.Error) {
-	// TODO(b/141022673): Validate IP addresses before adding them.
-
-	// Sanity check.
-	id := NetworkEndpointID{LocalAddress: protocolAddress.AddressWithPrefix.Address}
-	if ref, ok := n.mu.endpoints[id]; ok {
-		// Endpoint already exists.
-		if kind != permanent {
-			return nil, tcpip.ErrDuplicateAddress
-		}
-		switch ref.getKind() {
-		case permanentTentative, permanent:
-			// The NIC already have a permanent endpoint with that address.
-			return nil, tcpip.ErrDuplicateAddress
-		case permanentExpired, temporary:
-			// Promote the endpoint to become permanent and respect the new peb,
-			// configType and deprecated status.
-			if ref.tryIncRef() {
-				// TODO(b/147748385): Perform Duplicate Address Detection when promoting
-				// an IPv6 endpoint to permanent.
-				ref.setKind(permanent)
-				ref.deprecated = deprecated
-				ref.configType = configType
-
-				refs := n.mu.primary[ref.protocol]
-				for i, r := range refs {
-					if r == ref {
-						switch peb {
-						case CanBePrimaryEndpoint:
-							return ref, nil
-						case FirstPrimaryEndpoint:
-							if i == 0 {
-								return ref, nil
-							}
-							n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
-						case NeverPrimaryEndpoint:
-							n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
-							return ref, nil
-						}
-					}
-				}
-
-				n.insertPrimaryEndpointLocked(ref, peb)
-
-				return ref, nil
-			}
-			// tryIncRef failing means the endpoint is scheduled to be removed once
-			// the lock is released. Remove it here so we can create a new
-			// (permanent) one. The removal logic waiting for the lock handles this
-			// case.
-			n.removeEndpointLocked(ref)
-		}
-	}
-
+// addAddress adds a new address to n, so that it starts accepting packets
+// targeted at the given address (and network protocol).
+func (n *NIC) addAddress(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) *tcpip.Error {
 	ep, ok := n.networkEndpoints[protocolAddress.Protocol]
 	if !ok {
-		return nil, tcpip.ErrUnknownProtocol
-	}
-
-	isIPv6Unicast := protocolAddress.Protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(protocolAddress.AddressWithPrefix.Address)
-
-	// If the address is an IPv6 address and it is a permanent address,
-	// mark it as tentative so it goes through the DAD process if the NIC is
-	// enabled. If the NIC is not enabled, DAD will be started when the NIC is
-	// enabled.
-	if isIPv6Unicast && kind == permanent {
-		kind = permanentTentative
-	}
-
-	ref := &referencedNetworkEndpoint{
-		refs:       1,
-		addr:       protocolAddress.AddressWithPrefix,
-		ep:         ep,
-		nic:        n,
-		protocol:   protocolAddress.Protocol,
-		kind:       kind,
-		configType: configType,
-		deprecated: deprecated,
-	}
-
-	// Set up cache if link address resolution exists for this protocol.
-	if n.linkEP.Capabilities()&CapabilityResolutionRequired != 0 {
-		if _, ok := n.stack.linkAddrResolvers[protocolAddress.Protocol]; ok {
-			ref.linkCache = n.stack
-		}
-	}
-
-	// If we are adding an IPv6 unicast address, join the solicited-node
-	// multicast address.
-	if isIPv6Unicast {
-		snmc := header.SolicitedNodeAddr(protocolAddress.AddressWithPrefix.Address)
-		if err := n.joinGroupLocked(protocolAddress.Protocol, snmc); err != nil {
-			return nil, err
-		}
+		return tcpip.ErrUnknownProtocol
 	}
 
-	n.mu.endpoints[id] = ref
-
-	n.insertPrimaryEndpointLocked(ref, peb)
-
-	// If we are adding a tentative IPv6 address, start DAD if the NIC is enabled.
-	if isIPv6Unicast && kind == permanentTentative && n.mu.enabled {
-		if err := n.mu.ndp.startDuplicateAddressDetection(protocolAddress.AddressWithPrefix.Address, ref); err != nil {
-			return nil, err
-		}
+	addressEndpoint, err := ep.AddAndAcquirePermanentAddress(protocolAddress.AddressWithPrefix, peb, AddressConfigStatic, false /* deprecated */)
+	if err == nil {
+		// We have no need for the address endpoint.
+		addressEndpoint.DecRef()
 	}
-
-	return ref, nil
-}
-
-// AddAddress adds a new address to n, so that it starts accepting packets
-// targeted at the given address (and network protocol).
-func (n *NIC) AddAddress(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) *tcpip.Error {
-	// Add the endpoint.
-	n.mu.Lock()
-	_, err := n.addAddressLocked(protocolAddress, peb, permanent, static, false /* deprecated */)
-	n.mu.Unlock()
-
 	return err
 }
 
-// AllAddresses returns all addresses (primary and non-primary) associated with
+// allPermanentAddresses returns all permanent addresses associated with
 // this NIC.
-func (n *NIC) AllAddresses() []tcpip.ProtocolAddress {
-	n.mu.RLock()
-	defer n.mu.RUnlock()
-
-	addrs := make([]tcpip.ProtocolAddress, 0, len(n.mu.endpoints))
-	for _, ref := range n.mu.endpoints {
-		// Don't include tentative, expired or temporary endpoints to
-		// avoid confusion and prevent the caller from using those.
-		switch ref.getKind() {
-		case permanentExpired, temporary:
-			continue
+func (n *NIC) allPermanentAddresses() []tcpip.ProtocolAddress {
+	var addrs []tcpip.ProtocolAddress
+	for p, ep := range n.networkEndpoints {
+		for _, a := range ep.PermanentAddresses() {
+			addrs = append(addrs, tcpip.ProtocolAddress{Protocol: p, AddressWithPrefix: a})
 		}
-
-		addrs = append(addrs, tcpip.ProtocolAddress{
-			Protocol:          ref.protocol,
-			AddressWithPrefix: ref.addrWithPrefix(),
-		})
 	}
 	return addrs
 }
 
-// PrimaryAddresses returns the primary addresses associated with this NIC.
-func (n *NIC) PrimaryAddresses() []tcpip.ProtocolAddress {
-	n.mu.RLock()
-	defer n.mu.RUnlock()
-
+// primaryAddresses returns the primary addresses associated with this NIC.
+func (n *NIC) primaryAddresses() []tcpip.ProtocolAddress {
 	var addrs []tcpip.ProtocolAddress
-	for proto, list := range n.mu.primary {
-		for _, ref := range list {
-			// Don't include tentative, expired or tempory endpoints
-			// to avoid confusion and prevent the caller from using
-			// those.
-			switch ref.getKind() {
-			case permanentTentative, permanentExpired, temporary:
-				continue
-			}
-
-			addrs = append(addrs, tcpip.ProtocolAddress{
-				Protocol:          proto,
-				AddressWithPrefix: ref.addrWithPrefix(),
-			})
+	for p, ep := range n.networkEndpoints {
+		for _, a := range ep.PrimaryAddresses() {
+			addrs = append(addrs, tcpip.ProtocolAddress{Protocol: p, AddressWithPrefix: a})
 		}
 	}
 	return addrs
@@ -928,237 +430,135 @@ func (n *NIC) PrimaryAddresses() []tcpip.ProtocolAddress {
 // address exists. If no non-deprecated address exists, the first deprecated
 // address will be returned.
 func (n *NIC) primaryAddress(proto tcpip.NetworkProtocolNumber) tcpip.AddressWithPrefix {
-	n.mu.RLock()
-	defer n.mu.RUnlock()
-
-	list, ok := n.mu.primary[proto]
+	ep, ok := n.networkEndpoints[proto]
 	if !ok {
 		return tcpip.AddressWithPrefix{}
 	}
 
-	var deprecatedEndpoint *referencedNetworkEndpoint
-	for _, ref := range list {
-		// Don't include tentative, expired or tempory endpoints to avoid confusion
-		// and prevent the caller from using those.
-		switch ref.getKind() {
-		case permanentTentative, permanentExpired, temporary:
-			continue
-		}
-
-		if !ref.deprecated {
-			return ref.addrWithPrefix()
-		}
+	return ep.MainAddress()
+}
 
-		if deprecatedEndpoint == nil {
-			deprecatedEndpoint = ref
+// removeAddress removes an address from n.
+func (n *NIC) removeAddress(addr tcpip.Address) *tcpip.Error {
+	for _, ep := range n.networkEndpoints {
+		if err := ep.RemovePermanentAddress(addr); err == tcpip.ErrBadLocalAddress {
+			continue
+		} else {
+			return err
 		}
 	}
 
-	if deprecatedEndpoint != nil {
-		return deprecatedEndpoint.addrWithPrefix()
-	}
-
-	return tcpip.AddressWithPrefix{}
+	return tcpip.ErrBadLocalAddress
 }
 
-// insertPrimaryEndpointLocked adds r to n's primary endpoint list as required
-// by peb.
-//
-// n MUST be locked.
-func (n *NIC) insertPrimaryEndpointLocked(r *referencedNetworkEndpoint, peb PrimaryEndpointBehavior) {
-	switch peb {
-	case CanBePrimaryEndpoint:
-		n.mu.primary[r.protocol] = append(n.mu.primary[r.protocol], r)
-	case FirstPrimaryEndpoint:
-		n.mu.primary[r.protocol] = append([]*referencedNetworkEndpoint{r}, n.mu.primary[r.protocol]...)
+func (n *NIC) neighbors() ([]NeighborEntry, *tcpip.Error) {
+	if n.neigh == nil {
+		return nil, tcpip.ErrNotSupported
 	}
-}
 
-func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) {
-	id := NetworkEndpointID{LocalAddress: r.address()}
+	return n.neigh.entries(), nil
+}
 
-	// Nothing to do if the reference has already been replaced with a different
-	// one. This happens in the case where 1) this endpoint's ref count hit zero
-	// and was waiting (on the lock) to be removed and 2) the same address was
-	// re-added in the meantime by removing this endpoint from the list and
-	// adding a new one.
-	if n.mu.endpoints[id] != r {
+func (n *NIC) removeWaker(addr tcpip.Address, w *sleep.Waker) {
+	if n.neigh == nil {
 		return
 	}
 
-	if r.getKind() == permanent {
-		panic("Reference count dropped to zero before being removed")
-	}
+	n.neigh.removeWaker(addr, w)
+}
 
-	delete(n.mu.endpoints, id)
-	refs := n.mu.primary[r.protocol]
-	for i, ref := range refs {
-		if ref == r {
-			n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
-			refs[len(refs)-1] = nil
-			break
-		}
+func (n *NIC) addStaticNeighbor(addr tcpip.Address, linkAddress tcpip.LinkAddress) *tcpip.Error {
+	if n.neigh == nil {
+		return tcpip.ErrNotSupported
 	}
-}
 
-func (n *NIC) removeEndpoint(r *referencedNetworkEndpoint) {
-	n.mu.Lock()
-	n.removeEndpointLocked(r)
-	n.mu.Unlock()
+	n.neigh.addStaticEntry(addr, linkAddress)
+	return nil
 }
 
-func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
-	r, ok := n.mu.endpoints[NetworkEndpointID{addr}]
-	if !ok {
-		return tcpip.ErrBadLocalAddress
-	}
-
-	kind := r.getKind()
-	if kind != permanent && kind != permanentTentative {
-		return tcpip.ErrBadLocalAddress
+func (n *NIC) removeNeighbor(addr tcpip.Address) *tcpip.Error {
+	if n.neigh == nil {
+		return tcpip.ErrNotSupported
 	}
 
-	switch r.protocol {
-	case header.IPv6ProtocolNumber:
-		return n.removePermanentIPv6EndpointLocked(r, true /* allowSLAACInvalidation */)
-	default:
-		r.expireLocked()
-		return nil
+	if !n.neigh.removeEntry(addr) {
+		return tcpip.ErrBadAddress
 	}
+	return nil
 }
 
-func (n *NIC) removePermanentIPv6EndpointLocked(r *referencedNetworkEndpoint, allowSLAACInvalidation bool) *tcpip.Error {
-	addr := r.addrWithPrefix()
-
-	isIPv6Unicast := header.IsV6UnicastAddress(addr.Address)
-
-	if isIPv6Unicast {
-		n.mu.ndp.stopDuplicateAddressDetection(addr.Address)
-
-		// If we are removing an address generated via SLAAC, cleanup
-		// its SLAAC resources and notify the integrator.
-		switch r.configType {
-		case slaac:
-			n.mu.ndp.cleanupSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation)
-		case slaacTemp:
-			n.mu.ndp.cleanupTempSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation)
-		}
-	}
-
-	r.expireLocked()
-
-	// At this point the endpoint is deleted.
-
-	// If we are removing an IPv6 unicast address, leave the solicited-node
-	// multicast address.
-	//
-	// We ignore the tcpip.ErrBadLocalAddress error because the solicited-node
-	// multicast group may be left by user action.
-	if isIPv6Unicast {
-		snmc := header.SolicitedNodeAddr(addr.Address)
-		if err := n.leaveGroupLocked(snmc, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress {
-			return err
-		}
+func (n *NIC) clearNeighbors() *tcpip.Error {
+	if n.neigh == nil {
+		return tcpip.ErrNotSupported
 	}
 
+	n.neigh.clear()
 	return nil
 }
 
-// RemoveAddress removes an address from n.
-func (n *NIC) RemoveAddress(addr tcpip.Address) *tcpip.Error {
-	n.mu.Lock()
-	defer n.mu.Unlock()
-	return n.removePermanentAddressLocked(addr)
-}
-
 // joinGroup adds a new endpoint for the given multicast address, if none
 // exists yet. Otherwise it just increments its count.
 func (n *NIC) joinGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error {
-	n.mu.Lock()
-	defer n.mu.Unlock()
-
-	return n.joinGroupLocked(protocol, addr)
-}
-
-// joinGroupLocked adds a new endpoint for the given multicast address, if none
-// exists yet. Otherwise it just increments its count. n MUST be locked before
-// joinGroupLocked is called.
-func (n *NIC) joinGroupLocked(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error {
 	// TODO(b/143102137): When implementing MLD, make sure MLD packets are
 	// not sent unless a valid link-local address is available for use on n
 	// as an MLD packet's source address must be a link-local address as
 	// outlined in RFC 3810 section 5.
 
-	id := NetworkEndpointID{addr}
-	joins := n.mu.mcastJoins[id]
-	if joins == 0 {
-		netProto, ok := n.stack.networkProtocols[protocol]
-		if !ok {
-			return tcpip.ErrUnknownProtocol
-		}
-		if _, err := n.addAddressLocked(tcpip.ProtocolAddress{
-			Protocol: protocol,
-			AddressWithPrefix: tcpip.AddressWithPrefix{
-				Address:   addr,
-				PrefixLen: netProto.DefaultPrefixLen(),
-			},
-		}, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
-			return err
-		}
+	ep, ok := n.networkEndpoints[protocol]
+	if !ok {
+		return tcpip.ErrNotSupported
 	}
-	n.mu.mcastJoins[id] = joins + 1
-	return nil
+
+	gep, ok := ep.(GroupAddressableEndpoint)
+	if !ok {
+		return tcpip.ErrNotSupported
+	}
+
+	_, err := gep.JoinGroup(addr)
+	return err
 }
 
 // leaveGroup decrements the count for the given multicast address, and when it
 // reaches zero removes the endpoint for this address.
-func (n *NIC) leaveGroup(addr tcpip.Address) *tcpip.Error {
-	n.mu.Lock()
-	defer n.mu.Unlock()
-
-	return n.leaveGroupLocked(addr, false /* force */)
-}
+func (n *NIC) leaveGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error {
+	ep, ok := n.networkEndpoints[protocol]
+	if !ok {
+		return tcpip.ErrNotSupported
+	}
 
-// leaveGroupLocked decrements the count for the given multicast address, and
-// when it reaches zero removes the endpoint for this address. n MUST be locked
-// before leaveGroupLocked is called.
-//
-// If force is true, then the count for the multicast addres is ignored and the
-// endpoint will be removed immediately.
-func (n *NIC) leaveGroupLocked(addr tcpip.Address, force bool) *tcpip.Error {
-	id := NetworkEndpointID{addr}
-	joins, ok := n.mu.mcastJoins[id]
+	gep, ok := ep.(GroupAddressableEndpoint)
 	if !ok {
-		// There are no joins with this address on this NIC.
-		return tcpip.ErrBadLocalAddress
+		return tcpip.ErrNotSupported
 	}
 
-	joins--
-	if force || joins == 0 {
-		// There are no outstanding joins or we are forced to leave, clean up.
-		delete(n.mu.mcastJoins, id)
-		return n.removePermanentAddressLocked(addr)
+	if _, err := gep.LeaveGroup(addr); err != nil {
+		return err
 	}
 
-	n.mu.mcastJoins[id] = joins
 	return nil
 }
 
 // isInGroup returns true if n has joined the multicast group addr.
 func (n *NIC) isInGroup(addr tcpip.Address) bool {
-	n.mu.RLock()
-	joins := n.mu.mcastJoins[NetworkEndpointID{addr}]
-	n.mu.RUnlock()
+	for _, ep := range n.networkEndpoints {
+		gep, ok := ep.(GroupAddressableEndpoint)
+		if !ok {
+			continue
+		}
+
+		if gep.IsInGroup(addr) {
+			return true
+		}
+	}
 
-	return joins != 0
+	return false
 }
 
-func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt *PacketBuffer) {
-	r := makeRoute(protocol, dst, src, localLinkAddr, ref, false /* handleLocal */, false /* multicastLoop */)
+func (n *NIC) handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, remotelinkAddr tcpip.LinkAddress, addressEndpoint AssignableAddressEndpoint, pkt *PacketBuffer) {
+	r := makeRoute(protocol, dst, src, n, addressEndpoint, false /* handleLocal */, false /* multicastLoop */)
+	defer r.Release()
 	r.RemoteLinkAddress = remotelinkAddr
-
-	ref.ep.HandlePacket(&r, pkt)
-	ref.decRef()
+	n.getNetworkEndpoint(protocol).HandlePacket(&r, pkt)
 }
 
 // DeliverNetworkPacket finds the appropriate network protocol endpoint and
@@ -1169,7 +569,7 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address,
 // the ownership of the items is not retained by the caller.
 func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
 	n.mu.RLock()
-	enabled := n.mu.enabled
+	enabled := n.Enabled()
 	// If the NIC is not yet enabled, don't receive any packets.
 	if !enabled {
 		n.mu.RUnlock()
@@ -1192,12 +592,12 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp
 	// If no local link layer address is provided, assume it was sent
 	// directly to this NIC.
 	if local == "" {
-		local = n.linkEP.LinkAddress()
+		local = n.LinkEndpoint.LinkAddress()
 	}
 
-	// Are any packet sockets listening for this network protocol?
+	// Are any packet type sockets listening for this network protocol?
 	packetEPs := n.mu.packetEPs[protocol]
-	// Add any other packet sockets that maybe listening for all protocols.
+	// Add any other packet type sockets that may be listening for all protocols.
 	packetEPs = append(packetEPs, n.mu.packetEPs[header.EthernetProtocolAll]...)
 	n.mu.RUnlock()
 	for _, ep := range packetEPs {
@@ -1218,6 +618,7 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp
 		return
 	}
 	if hasTransportHdr {
+		pkt.TransportProtocolNumber = transProtoNum
 		// Parse the transport header if present.
 		if state, ok := n.stack.transportProtocols[transProtoNum]; ok {
 			state.proto.Parse(pkt)
@@ -1226,29 +627,33 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp
 
 	src, dst := netProto.ParseAddresses(pkt.NetworkHeader().View())
 
-	if n.stack.handleLocal && !n.isLoopback() && n.getRef(protocol, src) != nil {
-		// The source address is one of our own, so we never should have gotten a
-		// packet like this unless handleLocal is false. Loopback also calls this
-		// function even though the packets didn't come from the physical interface
-		// so don't drop those.
-		n.stack.stats.IP.InvalidSourceAddressesReceived.Increment()
-		return
+	if n.stack.handleLocal && !n.IsLoopback() {
+		if r := n.getAddress(protocol, src); r != nil {
+			r.DecRef()
+
+			// The source address is one of our own, so we never should have gotten a
+			// packet like this unless handleLocal is false. Loopback also calls this
+			// function even though the packets didn't come from the physical interface
+			// so don't drop those.
+			n.stack.stats.IP.InvalidSourceAddressesReceived.Increment()
+			return
+		}
 	}
 
-	// TODO(gvisor.dev/issue/170): Not supporting iptables for IPv6 yet.
 	// Loopback traffic skips the prerouting chain.
-	if protocol == header.IPv4ProtocolNumber && !n.isLoopback() {
+	if !n.IsLoopback() {
 		// iptables filtering.
 		ipt := n.stack.IPTables()
 		address := n.primaryAddress(protocol)
 		if ok := ipt.Check(Prerouting, pkt, nil, nil, address.Address, ""); !ok {
 			// iptables is telling us to drop the packet.
+			n.stack.stats.IP.IPTablesPreroutingDropped.Increment()
 			return
 		}
 	}
 
-	if ref := n.getRef(protocol, dst); ref != nil {
-		handlePacket(protocol, dst, src, n.linkEP.LinkAddress(), remote, ref, pkt)
+	if addressEndpoint := n.getAddress(protocol, dst); addressEndpoint != nil {
+		n.handlePacket(protocol, dst, src, remote, addressEndpoint, pkt)
 		return
 	}
 
@@ -1256,7 +661,7 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp
 	// packet and forward it to the NIC.
 	//
 	// TODO: Should we be forwarding the packet even if promiscuous?
-	if n.stack.Forwarding() {
+	if n.stack.Forwarding(protocol) {
 		r, err := n.stack.FindRoute(0, "", dst, protocol, false /* multicastLoop */)
 		if err != nil {
 			n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment()
@@ -1264,38 +669,41 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp
 		}
 
 		// Found a NIC.
-		n := r.ref.nic
-		n.mu.RLock()
-		ref, ok := n.mu.endpoints[NetworkEndpointID{dst}]
-		ok = ok && ref.isValidForOutgoingRLocked() && ref.tryIncRef()
-		n.mu.RUnlock()
-		if ok {
-			r.LocalLinkAddress = n.linkEP.LinkAddress()
-			r.RemoteLinkAddress = remote
-			r.RemoteAddress = src
-			// TODO(b/123449044): Update the source NIC as well.
-			ref.ep.HandlePacket(&r, pkt)
-			ref.decRef()
-			r.Release()
-			return
+		n := r.nic
+		if addressEndpoint := n.getAddressOrCreateTempInner(protocol, dst, false, NeverPrimaryEndpoint); addressEndpoint != nil {
+			if n.isValidForOutgoing(addressEndpoint) {
+				r.LocalLinkAddress = n.LinkEndpoint.LinkAddress()
+				r.RemoteLinkAddress = remote
+				r.RemoteAddress = src
+				// TODO(b/123449044): Update the source NIC as well.
+				n.getNetworkEndpoint(protocol).HandlePacket(&r, pkt)
+				addressEndpoint.DecRef()
+				r.Release()
+				return
+			}
+
+			addressEndpoint.DecRef()
 		}
 
 		// n doesn't have a destination endpoint.
 		// Send the packet out of n.
-		// TODO(b/128629022): move this logic to route.WritePacket.
-		if ch, err := r.Resolve(nil); err != nil {
-			if err == tcpip.ErrWouldBlock {
-				n.stack.forwarder.enqueue(ch, n, &r, protocol, pkt)
-				// forwarder will release route.
-				return
-			}
+		// TODO(gvisor.dev/issue/1085): According to the RFC, we must decrease the TTL field for ipv4/ipv6.
+
+		// pkt may have set its header and may not have enough headroom for
+		// link-layer header for the other link to prepend. Here we create a new
+		// packet to forward.
+		fwdPkt := NewPacketBuffer(PacketBufferOptions{
+			ReserveHeaderBytes: int(n.LinkEndpoint.MaxHeaderLength()),
+			// We need to do a deep copy of the IP packet because WritePacket (and
+			// friends) take ownership of the packet buffer, but we do not own it.
+			Data: PayloadSince(pkt.NetworkHeader()).ToVectorisedView(),
+		})
+
+		// TODO(b/143425874) Decrease the TTL field in forwarded packets.
+		if err := n.WritePacket(&r, nil, protocol, fwdPkt); err != nil {
 			n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment()
-			r.Release()
-			return
 		}
 
-		// The link-address resolution finished immediately.
-		n.forwardPacket(&r, protocol, pkt)
 		r.Release()
 		return
 	}
@@ -1319,41 +727,18 @@ func (n *NIC) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tc
 		p.PktType = tcpip.PacketOutgoing
 		// Add the link layer header as outgoing packets are intercepted
 		// before the link layer header is created.
-		n.linkEP.AddHeader(local, remote, protocol, p)
+		n.LinkEndpoint.AddHeader(local, remote, protocol, p)
 		ep.HandlePacket(n.id, local, protocol, p)
 	}
 }
 
-func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
-	// TODO(b/143425874) Decrease the TTL field in forwarded packets.
-
-	// pkt may have set its header and may not have enough headroom for link-layer
-	// header for the other link to prepend. Here we create a new packet to
-	// forward.
-	fwdPkt := NewPacketBuffer(PacketBufferOptions{
-		ReserveHeaderBytes: int(n.linkEP.MaxHeaderLength()),
-		Data:               buffer.NewVectorisedView(pkt.Size(), pkt.Views()),
-	})
-
-	// WritePacket takes ownership of fwdPkt, calculate numBytes first.
-	numBytes := fwdPkt.Size()
-
-	if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, fwdPkt); err != nil {
-		r.Stats().IP.OutgoingPacketErrors.Increment()
-		return
-	}
-
-	n.stats.Tx.Packets.Increment()
-	n.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
-}
-
 // DeliverTransportPacket delivers the packets to the appropriate transport
 // protocol endpoint.
-func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) {
+func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) TransportPacketDisposition {
 	state, ok := n.stack.transportProtocols[protocol]
 	if !ok {
 		n.stack.stats.UnknownProtocolRcvdPackets.Increment()
-		return
+		return TransportPacketProtocolUnreachable
 	}
 
 	transProto := state.proto
@@ -1374,41 +759,47 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 			// we parse it using the minimum size.
 			if _, ok := pkt.TransportHeader().Consume(transProto.MinimumPacketSize()); !ok {
 				n.stack.stats.MalformedRcvdPackets.Increment()
-				return
+				// We consider a malformed transport packet handled because there is
+				// nothing the caller can do.
+				return TransportPacketHandled
 			}
-		} else {
-			// This is either a bad packet or was re-assembled from fragments.
-			transProto.Parse(pkt)
+		} else if !transProto.Parse(pkt) {
+			n.stack.stats.MalformedRcvdPackets.Increment()
+			return TransportPacketHandled
 		}
 	}
 
-	if pkt.TransportHeader().View().Size() < transProto.MinimumPacketSize() {
-		n.stack.stats.MalformedRcvdPackets.Increment()
-		return
-	}
-
 	srcPort, dstPort, err := transProto.ParsePorts(pkt.TransportHeader().View())
 	if err != nil {
 		n.stack.stats.MalformedRcvdPackets.Increment()
-		return
+		return TransportPacketHandled
 	}
 
 	id := TransportEndpointID{dstPort, r.LocalAddress, srcPort, r.RemoteAddress}
 	if n.stack.demux.deliverPacket(r, protocol, pkt, id) {
-		return
+		return TransportPacketHandled
 	}
 
 	// Try to deliver to per-stack default handler.
 	if state.defaultHandler != nil {
 		if state.defaultHandler(r, id, pkt) {
-			return
+			return TransportPacketHandled
 		}
 	}
 
-	// We could not find an appropriate destination for this packet, so
-	// deliver it to the global handler.
-	if !transProto.HandleUnknownDestinationPacket(r, id, pkt) {
+	// We could not find an appropriate destination for this packet so
+	// give the protocol specific error handler a chance to handle it.
+	// If it doesn't handle it then we should do so.
+	switch res := transProto.HandleUnknownDestinationPacket(r, id, pkt); res {
+	case UnknownDestinationPacketMalformed:
 		n.stack.stats.MalformedRcvdPackets.Increment()
+		return TransportPacketHandled
+	case UnknownDestinationPacketUnhandled:
+		return TransportPacketDestinationPortUnreachable
+	case UnknownDestinationPacketHandled:
+		return TransportPacketHandled
+	default:
+		panic(fmt.Sprintf("unrecognized result from HandleUnknownDestinationPacket = %d", res))
 	}
 }
 
@@ -1441,96 +832,18 @@ func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcp
 	}
 }
 
-// ID returns the identifier of n.
+// ID implements NetworkInterface.
 func (n *NIC) ID() tcpip.NICID {
 	return n.id
 }
 
-// Name returns the name of n.
+// Name implements NetworkInterface.
 func (n *NIC) Name() string {
 	return n.name
 }
 
-// Stack returns the instance of the Stack that owns this NIC.
-func (n *NIC) Stack() *Stack {
-	return n.stack
-}
-
-// LinkEndpoint returns the link endpoint of n.
-func (n *NIC) LinkEndpoint() LinkEndpoint {
-	return n.linkEP
-}
-
-// isAddrTentative returns true if addr is tentative on n.
-//
-// Note that if addr is not associated with n, then this function will return
-// false. It will only return true if the address is associated with the NIC
-// AND it is tentative.
-func (n *NIC) isAddrTentative(addr tcpip.Address) bool {
-	n.mu.RLock()
-	defer n.mu.RUnlock()
-
-	ref, ok := n.mu.endpoints[NetworkEndpointID{addr}]
-	if !ok {
-		return false
-	}
-
-	return ref.getKind() == permanentTentative
-}
-
-// dupTentativeAddrDetected attempts to inform n that a tentative addr is a
-// duplicate on a link.
-//
-// dupTentativeAddrDetected will remove the tentative address if it exists. If
-// the address was generated via SLAAC, an attempt will be made to generate a
-// new address.
-func (n *NIC) dupTentativeAddrDetected(addr tcpip.Address) *tcpip.Error {
-	n.mu.Lock()
-	defer n.mu.Unlock()
-
-	ref, ok := n.mu.endpoints[NetworkEndpointID{addr}]
-	if !ok {
-		return tcpip.ErrBadAddress
-	}
-
-	if ref.getKind() != permanentTentative {
-		return tcpip.ErrInvalidEndpointState
-	}
-
-	// If the address is a SLAAC address, do not invalidate its SLAAC prefix as a
-	// new address will be generated for it.
-	if err := n.removePermanentIPv6EndpointLocked(ref, false /* allowSLAACInvalidation */); err != nil {
-		return err
-	}
-
-	prefix := ref.addrWithPrefix().Subnet()
-
-	switch ref.configType {
-	case slaac:
-		n.mu.ndp.regenerateSLAACAddr(prefix)
-	case slaacTemp:
-		// Do not reset the generation attempts counter for the prefix as the
-		// temporary address is being regenerated in response to a DAD conflict.
-		n.mu.ndp.regenerateTempSLAACAddr(prefix, false /* resetGenAttempts */)
-	}
-
-	return nil
-}
-
-// setNDPConfigs sets the NDP configurations for n.
-//
-// Note, if c contains invalid NDP configuration values, it will be fixed to
-// use default values for the erroneous values.
-func (n *NIC) setNDPConfigs(c NDPConfigurations) {
-	c.validate()
-
-	n.mu.Lock()
-	n.mu.ndp.configs = c
-	n.mu.Unlock()
-}
-
-// NUDConfigs gets the NUD configurations for n.
-func (n *NIC) NUDConfigs() (NUDConfigurations, *tcpip.Error) {
+// nudConfigs gets the NUD configurations for n.
+func (n *NIC) nudConfigs() (NUDConfigurations, *tcpip.Error) {
 	if n.neigh == nil {
 		return NUDConfigurations{}, tcpip.ErrNotSupported
 	}
@@ -1550,49 +863,6 @@ func (n *NIC) setNUDConfigs(c NUDConfigurations) *tcpip.Error {
 	return nil
 }
 
-// handleNDPRA handles an NDP Router Advertisement message that arrived on n.
-func (n *NIC) handleNDPRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
-	n.mu.Lock()
-	defer n.mu.Unlock()
-
-	n.mu.ndp.handleRA(ip, ra)
-}
-
-type networkEndpointKind int32
-
-const (
-	// A permanentTentative endpoint is a permanent address that is not yet
-	// considered to be fully bound to an interface in the traditional
-	// sense. That is, the address is associated with a NIC, but packets
-	// destined to the address MUST NOT be accepted and MUST be silently
-	// dropped, and the address MUST NOT be used as a source address for
-	// outgoing packets. For IPv6, addresses will be of this kind until
-	// NDP's Duplicate Address Detection has resolved, or be deleted if
-	// the process results in detecting a duplicate address.
-	permanentTentative networkEndpointKind = iota
-
-	// A permanent endpoint is created by adding a permanent address (vs. a
-	// temporary one) to the NIC. Its reference count is biased by 1 to avoid
-	// removal when no route holds a reference to it. It is removed by explicitly
-	// removing the permanent address from the NIC.
-	permanent
-
-	// An expired permanent endpoint is a permanent endpoint that had its address
-	// removed from the NIC, and it is waiting to be removed once no more routes
-	// hold a reference to it. This is achieved by decreasing its reference count
-	// by 1. If its address is re-added before the endpoint is removed, its type
-	// changes back to permanent and its reference count increases by 1 again.
-	permanentExpired
-
-	// A temporary endpoint is created for spoofing outgoing packets, or when in
-	// promiscuous mode and accepting incoming packets that don't match any
-	// permanent endpoint. Its reference count is not biased by 1 and the
-	// endpoint is removed immediately when no more route holds a reference to
-	// it. A temporary endpoint can be promoted to permanent if its address
-	// is added permanently.
-	temporary
-)
-
 func (n *NIC) registerPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) *tcpip.Error {
 	n.mu.Lock()
 	defer n.mu.Unlock()
@@ -1623,149 +893,12 @@ func (n *NIC) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep
 	}
 }
 
-type networkEndpointConfigType int32
-
-const (
-	// A statically configured endpoint is an address that was added by
-	// some user-specified action (adding an explicit address, joining a
-	// multicast group).
-	static networkEndpointConfigType = iota
-
-	// A SLAAC configured endpoint is an IPv6 endpoint that was added by
-	// SLAAC as per RFC 4862 section 5.5.3.
-	slaac
-
-	// A temporary SLAAC configured endpoint is an IPv6 endpoint that was added by
-	// SLAAC as per RFC 4941. Temporary SLAAC addresses are short-lived and are
-	// not expected to be valid (or preferred) forever; hence the term temporary.
-	slaacTemp
-)
-
-type referencedNetworkEndpoint struct {
-	ep       NetworkEndpoint
-	addr     tcpip.AddressWithPrefix
-	nic      *NIC
-	protocol tcpip.NetworkProtocolNumber
-
-	// linkCache is set if link address resolution is enabled for this
-	// protocol. Set to nil otherwise.
-	linkCache LinkAddressCache
-
-	// refs is counting references held for this endpoint. When refs hits zero it
-	// triggers the automatic removal of the endpoint from the NIC.
-	refs int32
-
-	// networkEndpointKind must only be accessed using {get,set}Kind().
-	kind networkEndpointKind
-
-	// configType is the method that was used to configure this endpoint.
-	// This must never change except during endpoint creation and promotion to
-	// permanent.
-	configType networkEndpointConfigType
-
-	// deprecated indicates whether or not the endpoint should be considered
-	// deprecated. That is, when deprecated is true, other endpoints that are not
-	// deprecated should be preferred.
-	deprecated bool
-}
-
-func (r *referencedNetworkEndpoint) address() tcpip.Address {
-	return r.addr.Address
-}
-
-func (r *referencedNetworkEndpoint) addrWithPrefix() tcpip.AddressWithPrefix {
-	return r.addr
-}
-
-func (r *referencedNetworkEndpoint) getKind() networkEndpointKind {
-	return networkEndpointKind(atomic.LoadInt32((*int32)(&r.kind)))
-}
-
-func (r *referencedNetworkEndpoint) setKind(kind networkEndpointKind) {
-	atomic.StoreInt32((*int32)(&r.kind), int32(kind))
-}
-
 // isValidForOutgoing returns true if the endpoint can be used to send out a
 // packet. It requires the endpoint to not be marked expired (i.e., its address)
 // has been removed) unless the NIC is in spoofing mode, or temporary.
-func (r *referencedNetworkEndpoint) isValidForOutgoing() bool {
-	r.nic.mu.RLock()
-	defer r.nic.mu.RUnlock()
-
-	return r.isValidForOutgoingRLocked()
-}
-
-// isValidForOutgoingRLocked is the same as isValidForOutgoing but requires
-// r.nic.mu to be read locked.
-func (r *referencedNetworkEndpoint) isValidForOutgoingRLocked() bool {
-	if !r.nic.mu.enabled {
-		return false
-	}
-
-	return r.isAssignedRLocked(r.nic.mu.spoofing)
-}
-
-// isAssignedRLocked returns true if r is considered to be assigned to the NIC.
-//
-// r.nic.mu must be read locked.
-func (r *referencedNetworkEndpoint) isAssignedRLocked(spoofingOrPromiscuous bool) bool {
-	switch r.getKind() {
-	case permanentTentative:
-		return false
-	case permanentExpired:
-		return spoofingOrPromiscuous
-	default:
-		return true
-	}
-}
-
-// expireLocked decrements the reference count and marks the permanent endpoint
-// as expired.
-func (r *referencedNetworkEndpoint) expireLocked() {
-	r.setKind(permanentExpired)
-	r.decRefLocked()
-}
-
-// decRef decrements the ref count and cleans up the endpoint once it reaches
-// zero.
-func (r *referencedNetworkEndpoint) decRef() {
-	if atomic.AddInt32(&r.refs, -1) == 0 {
-		r.nic.removeEndpoint(r)
-	}
-}
-
-// decRefLocked is the same as decRef but assumes that the NIC.mu mutex is
-// locked.
-func (r *referencedNetworkEndpoint) decRefLocked() {
-	if atomic.AddInt32(&r.refs, -1) == 0 {
-		r.nic.removeEndpointLocked(r)
-	}
-}
-
-// incRef increments the ref count. It must only be called when the caller is
-// known to be holding a reference to the endpoint, otherwise tryIncRef should
-// be used.
-func (r *referencedNetworkEndpoint) incRef() {
-	atomic.AddInt32(&r.refs, 1)
-}
-
-// tryIncRef attempts to increment the ref count from n to n+1, but only if n is
-// not zero. That is, it will increment the count if the endpoint is still
-// alive, and do nothing if it has already been clean up.
-func (r *referencedNetworkEndpoint) tryIncRef() bool {
-	for {
-		v := atomic.LoadInt32(&r.refs)
-		if v == 0 {
-			return false
-		}
-
-		if atomic.CompareAndSwapInt32(&r.refs, v, v+1) {
-			return true
-		}
-	}
-}
-
-// stack returns the Stack instance that owns the underlying endpoint.
-func (r *referencedNetworkEndpoint) stack() *Stack {
-	return r.nic.stack
+func (n *NIC) isValidForOutgoing(ep AssignableAddressEndpoint) bool {
+	n.mu.RLock()
+	spoofing := n.mu.spoofing
+	n.mu.RUnlock()
+	return n.Enabled() && ep.IsAssigned(spoofing)
 }
diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go
index d312a79eb..4af04846f 100644
--- a/pkg/tcpip/stack/nic_test.go
+++ b/pkg/tcpip/stack/nic_test.go
@@ -15,96 +15,39 @@
 package stack
 
 import (
-	"math"
 	"testing"
-	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
-var _ LinkEndpoint = (*testLinkEndpoint)(nil)
+var _ AddressableEndpoint = (*testIPv6Endpoint)(nil)
+var _ NetworkEndpoint = (*testIPv6Endpoint)(nil)
+var _ NDPEndpoint = (*testIPv6Endpoint)(nil)
 
-// A LinkEndpoint that throws away outgoing packets.
+// An IPv6 NetworkEndpoint that throws away outgoing packets.
 //
-// We use this instead of the channel endpoint as the channel package depends on
+// We use this instead of ipv6.endpoint because the ipv6 package depends on
 // the stack package which this test lives in, causing a cyclic dependency.
-type testLinkEndpoint struct {
-	dispatcher NetworkDispatcher
-}
-
-// Attach implements LinkEndpoint.Attach.
-func (e *testLinkEndpoint) Attach(dispatcher NetworkDispatcher) {
-	e.dispatcher = dispatcher
-}
-
-// IsAttached implements LinkEndpoint.IsAttached.
-func (e *testLinkEndpoint) IsAttached() bool {
-	return e.dispatcher != nil
-}
-
-// MTU implements LinkEndpoint.MTU.
-func (*testLinkEndpoint) MTU() uint32 {
-	return math.MaxUint16
-}
-
-// Capabilities implements LinkEndpoint.Capabilities.
-func (*testLinkEndpoint) Capabilities() LinkEndpointCapabilities {
-	return CapabilityResolutionRequired
-}
+type testIPv6Endpoint struct {
+	AddressableEndpointState
 
-// MaxHeaderLength implements LinkEndpoint.MaxHeaderLength.
-func (*testLinkEndpoint) MaxHeaderLength() uint16 {
-	return 0
-}
+	nic      NetworkInterface
+	protocol *testIPv6Protocol
 
-// LinkAddress returns the link address of this endpoint.
-func (*testLinkEndpoint) LinkAddress() tcpip.LinkAddress {
-	return ""
+	invalidatedRtr tcpip.Address
 }
 
-// Wait implements LinkEndpoint.Wait.
-func (*testLinkEndpoint) Wait() {}
-
-// WritePacket implements LinkEndpoint.WritePacket.
-func (e *testLinkEndpoint) WritePacket(*Route, *GSO, tcpip.NetworkProtocolNumber, *PacketBuffer) *tcpip.Error {
+func (*testIPv6Endpoint) Enable() *tcpip.Error {
 	return nil
 }
 
-// WritePackets implements LinkEndpoint.WritePackets.
-func (e *testLinkEndpoint) WritePackets(*Route, *GSO, PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
-	// Our tests don't use this so we don't support it.
-	return 0, tcpip.ErrNotSupported
-}
-
-// WriteRawPacket implements LinkEndpoint.WriteRawPacket.
-func (e *testLinkEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
-	// Our tests don't use this so we don't support it.
-	return tcpip.ErrNotSupported
-}
-
-// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
-func (*testLinkEndpoint) ARPHardwareType() header.ARPHardwareType {
-	panic("not implemented")
-}
-
-// AddHeader implements stack.LinkEndpoint.AddHeader.
-func (e *testLinkEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
-	panic("not implemented")
+func (*testIPv6Endpoint) Enabled() bool {
+	return true
 }
 
-var _ NetworkEndpoint = (*testIPv6Endpoint)(nil)
-
-// An IPv6 NetworkEndpoint that throws away outgoing packets.
-//
-// We use this instead of ipv6.endpoint because the ipv6 package depends on
-// the stack package which this test lives in, causing a cyclic dependency.
-type testIPv6Endpoint struct {
-	nicID    tcpip.NICID
-	linkEP   LinkEndpoint
-	protocol *testIPv6Protocol
-}
+func (*testIPv6Endpoint) Disable() {}
 
 // DefaultTTL implements NetworkEndpoint.DefaultTTL.
 func (*testIPv6Endpoint) DefaultTTL() uint8 {
@@ -113,17 +56,12 @@ func (*testIPv6Endpoint) DefaultTTL() uint8 {
 
 // MTU implements NetworkEndpoint.MTU.
 func (e *testIPv6Endpoint) MTU() uint32 {
-	return e.linkEP.MTU() - header.IPv6MinimumSize
-}
-
-// Capabilities implements NetworkEndpoint.Capabilities.
-func (e *testIPv6Endpoint) Capabilities() LinkEndpointCapabilities {
-	return e.linkEP.Capabilities()
+	return e.nic.MTU() - header.IPv6MinimumSize
 }
 
 // MaxHeaderLength implements NetworkEndpoint.MaxHeaderLength.
 func (e *testIPv6Endpoint) MaxHeaderLength() uint16 {
-	return e.linkEP.MaxHeaderLength() + header.IPv6MinimumSize
+	return e.nic.MaxHeaderLength() + header.IPv6MinimumSize
 }
 
 // WritePacket implements NetworkEndpoint.WritePacket.
@@ -144,23 +82,24 @@ func (*testIPv6Endpoint) WriteHeaderIncludedPacket(*Route, *PacketBuffer) *tcpip
 	return tcpip.ErrNotSupported
 }
 
-// NICID implements NetworkEndpoint.NICID.
-func (e *testIPv6Endpoint) NICID() tcpip.NICID {
-	return e.nicID
-}
-
 // HandlePacket implements NetworkEndpoint.HandlePacket.
 func (*testIPv6Endpoint) HandlePacket(*Route, *PacketBuffer) {
 }
 
 // Close implements NetworkEndpoint.Close.
-func (*testIPv6Endpoint) Close() {}
+func (e *testIPv6Endpoint) Close() {
+	e.AddressableEndpointState.Cleanup()
+}
 
 // NetworkProtocolNumber implements NetworkEndpoint.NetworkProtocolNumber.
 func (*testIPv6Endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
 	return header.IPv6ProtocolNumber
 }
 
+func (e *testIPv6Endpoint) InvalidateDefaultRouter(rtr tcpip.Address) {
+	e.invalidatedRtr = rtr
+}
+
 var _ NetworkProtocol = (*testIPv6Protocol)(nil)
 
 // An IPv6 NetworkProtocol that supports the bare minimum to make a stack
@@ -192,21 +131,22 @@ func (*testIPv6Protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address)
 }
 
 // NewEndpoint implements NetworkProtocol.NewEndpoint.
-func (p *testIPv6Protocol) NewEndpoint(nicID tcpip.NICID, _ LinkAddressCache, _ TransportDispatcher, linkEP LinkEndpoint, _ *Stack) NetworkEndpoint {
-	return &testIPv6Endpoint{
-		nicID:    nicID,
-		linkEP:   linkEP,
+func (p *testIPv6Protocol) NewEndpoint(nic NetworkInterface, _ LinkAddressCache, _ NUDHandler, _ TransportDispatcher) NetworkEndpoint {
+	e := &testIPv6Endpoint{
+		nic:      nic,
 		protocol: p,
 	}
+	e.AddressableEndpointState.Init(e)
+	return e
 }
 
 // SetOption implements NetworkProtocol.SetOption.
-func (*testIPv6Protocol) SetOption(interface{}) *tcpip.Error {
+func (*testIPv6Protocol) SetOption(tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	return nil
 }
 
 // Option implements NetworkProtocol.Option.
-func (*testIPv6Protocol) Option(interface{}) *tcpip.Error {
+func (*testIPv6Protocol) Option(tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	return nil
 }
 
@@ -229,7 +169,7 @@ func (*testIPv6Protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
 }
 
 // LinkAddressRequest implements LinkAddressResolver.
-func (*testIPv6Protocol) LinkAddressRequest(_, _ tcpip.Address, _ tcpip.LinkAddress, _ LinkEndpoint) *tcpip.Error {
+func (*testIPv6Protocol) LinkAddressRequest(_, _ tcpip.Address, _ tcpip.LinkAddress, _ NetworkInterface) *tcpip.Error {
 	return nil
 }
 
@@ -241,38 +181,6 @@ func (*testIPv6Protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAdd
 	return "", false
 }
 
-// Test the race condition where a NIC is removed and an RS timer fires at the
-// same time.
-func TestRemoveNICWhileHandlingRSTimer(t *testing.T) {
-	const (
-		nicID = 1
-
-		maxRtrSolicitations = 5
-	)
-
-	e := testLinkEndpoint{}
-	s := New(Options{
-		NetworkProtocols: []NetworkProtocol{&testIPv6Protocol{}},
-		NDPConfigs: NDPConfigurations{
-			MaxRtrSolicitations:     maxRtrSolicitations,
-			RtrSolicitationInterval: minimumRtrSolicitationInterval,
-		},
-	})
-
-	if err := s.CreateNIC(nicID, &e); err != nil {
-		t.Fatalf("s.CreateNIC(%d, _) = %s", nicID, err)
-	}
-
-	s.mu.Lock()
-	// Wait for the router solicitation timer to fire and block trying to obtain
-	// the stack lock when doing link address resolution.
-	time.Sleep(minimumRtrSolicitationInterval * 2)
-	if err := s.removeNICLocked(nicID); err != nil {
-		t.Fatalf("s.removeNICLocked(%d) = %s", nicID, err)
-	}
-	s.mu.Unlock()
-}
-
 func TestDisabledRxStatsWhenNICDisabled(t *testing.T) {
 	// When the NIC is disabled, the only field that matters is the stats field.
 	// This test is limited to stats counter checks.
diff --git a/pkg/tcpip/stack/nud.go b/pkg/tcpip/stack/nud.go
index e1ec15487..ab629b3a4 100644
--- a/pkg/tcpip/stack/nud.go
+++ b/pkg/tcpip/stack/nud.go
@@ -129,7 +129,7 @@ type NUDDispatcher interface {
 	// the stack's operation.
 	//
 	// May be called concurrently.
-	OnNeighborAdded(nicID tcpip.NICID, ipAddr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time)
+	OnNeighborAdded(tcpip.NICID, NeighborEntry)
 
 	// OnNeighborChanged will be called when an entry in a NIC's (with ID nicID)
 	// neighbor table changes state and/or link address.
@@ -138,7 +138,7 @@ type NUDDispatcher interface {
 	// the stack's operation.
 	//
 	// May be called concurrently.
-	OnNeighborChanged(nicID tcpip.NICID, ipAddr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time)
+	OnNeighborChanged(tcpip.NICID, NeighborEntry)
 
 	// OnNeighborRemoved will be called when an entry is removed from a NIC's
 	// (with ID nicID) neighbor table.
@@ -147,7 +147,7 @@ type NUDDispatcher interface {
 	// the stack's operation.
 	//
 	// May be called concurrently.
-	OnNeighborRemoved(nicID tcpip.NICID, ipAddr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time)
+	OnNeighborRemoved(tcpip.NICID, NeighborEntry)
 }
 
 // ReachabilityConfirmationFlags describes the flags used within a reachability
@@ -177,7 +177,7 @@ type NUDHandler interface {
 	// Neighbor Solicitation for ARP or NDP, respectively). Validation of the
 	// probe needs to be performed before calling this function since the
 	// Neighbor Cache doesn't have access to view the NIC's assigned addresses.
-	HandleProbe(remoteAddr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes LinkAddressResolver)
+	HandleProbe(remoteAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes LinkAddressResolver)
 
 	// HandleConfirmation processes an incoming neighbor confirmation (e.g. ARP
 	// reply or Neighbor Advertisement for ARP or NDP, respectively).
diff --git a/pkg/tcpip/stack/nud_test.go b/pkg/tcpip/stack/nud_test.go
index 2494ee610..8cffb9fc6 100644
--- a/pkg/tcpip/stack/nud_test.go
+++ b/pkg/tcpip/stack/nud_test.go
@@ -60,7 +60,8 @@ func TestSetNUDConfigurationFailsForBadNICID(t *testing.T) {
 		// A neighbor cache is required to store NUDConfigurations. The networking
 		// stack will only allocate neighbor caches if a protocol providing link
 		// address resolution is specified (e.g. ARP or IPv6).
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
+		UseNeighborCache: true,
 	})
 
 	// No NIC with ID 1 yet.
@@ -84,7 +85,8 @@ func TestNUDConfigurationFailsForNotSupported(t *testing.T) {
 	e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 
 	s := stack.New(stack.Options{
-		NUDConfigs: stack.DefaultNUDConfigurations(),
+		NUDConfigs:       stack.DefaultNUDConfigurations(),
+		UseNeighborCache: true,
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -108,7 +110,8 @@ func TestSetNUDConfigurationFailsForNotSupported(t *testing.T) {
 	e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 
 	s := stack.New(stack.Options{
-		NUDConfigs: stack.DefaultNUDConfigurations(),
+		NUDConfigs:       stack.DefaultNUDConfigurations(),
+		UseNeighborCache: true,
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -134,8 +137,9 @@ func TestDefaultNUDConfigurations(t *testing.T) {
 		// A neighbor cache is required to store NUDConfigurations. The networking
 		// stack will only allocate neighbor caches if a protocol providing link
 		// address resolution is specified (e.g. ARP or IPv6).
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 		NUDConfigs:       stack.DefaultNUDConfigurations(),
+		UseNeighborCache: true,
 	})
 	if err := s.CreateNIC(nicID, e); err != nil {
 		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -188,8 +192,9 @@ func TestNUDConfigurationsBaseReachableTime(t *testing.T) {
 				// A neighbor cache is required to store NUDConfigurations. The
 				// networking stack will only allocate neighbor caches if a protocol
 				// providing link address resolution is specified (e.g. ARP or IPv6).
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -244,8 +249,9 @@ func TestNUDConfigurationsMinRandomFactor(t *testing.T) {
 				// A neighbor cache is required to store NUDConfigurations. The
 				// networking stack will only allocate neighbor caches if a protocol
 				// providing link address resolution is specified (e.g. ARP or IPv6).
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -323,8 +329,9 @@ func TestNUDConfigurationsMaxRandomFactor(t *testing.T) {
 				// A neighbor cache is required to store NUDConfigurations. The
 				// networking stack will only allocate neighbor caches if a protocol
 				// providing link address resolution is specified (e.g. ARP or IPv6).
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -384,8 +391,9 @@ func TestNUDConfigurationsRetransmitTimer(t *testing.T) {
 				// A neighbor cache is required to store NUDConfigurations. The
 				// networking stack will only allocate neighbor caches if a protocol
 				// providing link address resolution is specified (e.g. ARP or IPv6).
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -435,8 +443,9 @@ func TestNUDConfigurationsDelayFirstProbeTime(t *testing.T) {
 				// A neighbor cache is required to store NUDConfigurations. The
 				// networking stack will only allocate neighbor caches if a protocol
 				// providing link address resolution is specified (e.g. ARP or IPv6).
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -486,8 +495,9 @@ func TestNUDConfigurationsMaxMulticastProbes(t *testing.T) {
 				// A neighbor cache is required to store NUDConfigurations. The
 				// networking stack will only allocate neighbor caches if a protocol
 				// providing link address resolution is specified (e.g. ARP or IPv6).
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -537,8 +547,9 @@ func TestNUDConfigurationsMaxUnicastProbes(t *testing.T) {
 				// A neighbor cache is required to store NUDConfigurations. The
 				// networking stack will only allocate neighbor caches if a protocol
 				// providing link address resolution is specified (e.g. ARP or IPv6).
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -588,8 +599,9 @@ func TestNUDConfigurationsUnreachableTime(t *testing.T) {
 				// A neighbor cache is required to store NUDConfigurations. The
 				// networking stack will only allocate neighbor caches if a protocol
 				// providing link address resolution is specified (e.g. ARP or IPv6).
-				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 				NUDConfigs:       c,
+				UseNeighborCache: true,
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index 17b8beebb..7f54a6de8 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -19,6 +19,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
 type headerType int
@@ -80,11 +81,17 @@ type PacketBuffer struct {
 	// data are held in the same underlying buffer storage.
 	header buffer.Prependable
 
-	// NetworkProtocol is only valid when NetworkHeader is set.
+	// NetworkProtocolNumber is only valid when NetworkHeader().View().IsEmpty()
+	// returns false.
 	// TODO(gvisor.dev/issue/3574): Remove the separately passed protocol
 	// numbers in registration APIs that take a PacketBuffer.
 	NetworkProtocolNumber tcpip.NetworkProtocolNumber
 
+	// TransportProtocol is only valid if it is non zero.
+	// TODO(gvisor.dev/issue/3810): This and the network protocol number should
+	// be moved into the headerinfo. This should resolve the validity issue.
+	TransportProtocolNumber tcpip.TransportProtocolNumber
+
 	// Hash is the transport layer hash of this packet. A value of zero
 	// indicates no valid hash has been set.
 	Hash uint32
@@ -234,20 +241,35 @@ func (pk *PacketBuffer) consume(typ headerType, size int) (v buffer.View, consum
 // underlying packet payload.
 func (pk *PacketBuffer) Clone() *PacketBuffer {
 	newPk := &PacketBuffer{
-		PacketBufferEntry:     pk.PacketBufferEntry,
-		Data:                  pk.Data.Clone(nil),
-		headers:               pk.headers,
-		header:                pk.header,
-		Hash:                  pk.Hash,
-		Owner:                 pk.Owner,
-		EgressRoute:           pk.EgressRoute,
-		GSOOptions:            pk.GSOOptions,
-		NetworkProtocolNumber: pk.NetworkProtocolNumber,
-		NatDone:               pk.NatDone,
+		PacketBufferEntry:       pk.PacketBufferEntry,
+		Data:                    pk.Data.Clone(nil),
+		headers:                 pk.headers,
+		header:                  pk.header,
+		Hash:                    pk.Hash,
+		Owner:                   pk.Owner,
+		EgressRoute:             pk.EgressRoute,
+		GSOOptions:              pk.GSOOptions,
+		NetworkProtocolNumber:   pk.NetworkProtocolNumber,
+		NatDone:                 pk.NatDone,
+		TransportProtocolNumber: pk.TransportProtocolNumber,
 	}
 	return newPk
 }
 
+// Network returns the network header as a header.Network.
+//
+// Network should only be called when NetworkHeader has been set.
+func (pk *PacketBuffer) Network() header.Network {
+	switch netProto := pk.NetworkProtocolNumber; netProto {
+	case header.IPv4ProtocolNumber:
+		return header.IPv4(pk.NetworkHeader().View())
+	case header.IPv6ProtocolNumber:
+		return header.IPv6(pk.NetworkHeader().View())
+	default:
+		panic(fmt.Sprintf("unknown network protocol number %d", netProto))
+	}
+}
+
 // headerInfo stores metadata about a header in a packet.
 type headerInfo struct {
 	// buf is the memorized slice for both prepended and consumed header.
@@ -289,11 +311,25 @@ func (h PacketHeader) Consume(size int) (v buffer.View, consumed bool) {
 }
 
 // PayloadSince returns packet payload starting from and including a particular
-// header. This method isn't optimized and should be used in test only.
+// header.
+//
+// The returned View is owned by the caller - its backing buffer is separate
+// from the packet header's underlying packet buffer.
 func PayloadSince(h PacketHeader) buffer.View {
-	var v buffer.View
+	size := h.pk.Data.Size()
+	for _, hinfo := range h.pk.headers[h.typ:] {
+		size += len(hinfo.buf)
+	}
+
+	v := make(buffer.View, 0, size)
+
 	for _, hinfo := range h.pk.headers[h.typ:] {
 		v = append(v, hinfo.buf...)
 	}
-	return append(v, h.pk.Data.ToView()...)
+
+	for _, view := range h.pk.Data.Views() {
+		v = append(v, view...)
+	}
+
+	return v
 }
diff --git a/pkg/tcpip/stack/forwarder.go b/pkg/tcpip/stack/pending_packets.go
index 3eff141e6..f838eda8d 100644
--- a/pkg/tcpip/stack/forwarder.go
+++ b/pkg/tcpip/stack/pending_packets.go
@@ -29,60 +29,60 @@ const (
 )
 
 type pendingPacket struct {
-	nic   *NIC
 	route *Route
 	proto tcpip.NetworkProtocolNumber
 	pkt   *PacketBuffer
 }
 
-type forwardQueue struct {
+// packetsPendingLinkResolution is a queue of packets pending link resolution.
+//
+// Once link resolution completes successfully, the packets will be written.
+type packetsPendingLinkResolution struct {
 	sync.Mutex
 
 	// The packets to send once the resolver completes.
-	packets map[<-chan struct{}][]*pendingPacket
+	packets map[<-chan struct{}][]pendingPacket
 
 	// FIFO of channels used to cancel the oldest goroutine waiting for
 	// link-address resolution.
 	cancelChans []chan struct{}
 }
 
-func newForwardQueue() *forwardQueue {
-	return &forwardQueue{packets: make(map[<-chan struct{}][]*pendingPacket)}
+func (f *packetsPendingLinkResolution) init() {
+	f.Lock()
+	defer f.Unlock()
+	f.packets = make(map[<-chan struct{}][]pendingPacket)
 }
 
-func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
-	shouldWait := false
-
+func (f *packetsPendingLinkResolution) enqueue(ch <-chan struct{}, r *Route, proto tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
 	f.Lock()
+	defer f.Unlock()
+
 	packets, ok := f.packets[ch]
-	if !ok {
-		shouldWait = true
-	}
-	for len(packets) == maxPendingPacketsPerResolution {
+	if len(packets) == maxPendingPacketsPerResolution {
 		p := packets[0]
+		packets[0] = pendingPacket{}
 		packets = packets[1:]
-		p.nic.stack.stats.IP.OutgoingPacketErrors.Increment()
+		p.route.Stats().IP.OutgoingPacketErrors.Increment()
 		p.route.Release()
 	}
+
 	if l := len(packets); l >= maxPendingPacketsPerResolution {
 		panic(fmt.Sprintf("max pending packets for resolution reached; got %d packets, max = %d", l, maxPendingPacketsPerResolution))
 	}
-	f.packets[ch] = append(packets, &pendingPacket{
-		nic:   n,
+
+	f.packets[ch] = append(packets, pendingPacket{
 		route: r,
-		proto: protocol,
+		proto: proto,
 		pkt:   pkt,
 	})
-	f.Unlock()
 
-	if !shouldWait {
+	if ok {
 		return
 	}
 
 	// Wait for the link-address resolution to complete.
-	// Start a goroutine with a forwarding-cancel channel so that we can
-	// limit the maximum number of goroutines running concurrently.
-	cancel := f.newCancelChannel()
+	cancel := f.newCancelChannelLocked()
 	go func() {
 		cancelled := false
 		select {
@@ -92,17 +92,21 @@ func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tc
 		}
 
 		f.Lock()
-		packets := f.packets[ch]
+		packets, ok := f.packets[ch]
 		delete(f.packets, ch)
 		f.Unlock()
 
+		if !ok {
+			panic(fmt.Sprintf("link-resolution goroutine woke up but no entry exists in the queue of packets"))
+		}
+
 		for _, p := range packets {
 			if cancelled {
-				p.nic.stack.stats.IP.OutgoingPacketErrors.Increment()
+				p.route.Stats().IP.OutgoingPacketErrors.Increment()
 			} else if _, err := p.route.Resolve(nil); err != nil {
-				p.nic.stack.stats.IP.OutgoingPacketErrors.Increment()
+				p.route.Stats().IP.OutgoingPacketErrors.Increment()
 			} else {
-				p.nic.forwardPacket(p.route, p.proto, p.pkt)
+				p.route.nic.writePacket(p.route, nil /* gso */, p.proto, p.pkt)
 			}
 			p.route.Release()
 		}
@@ -112,12 +116,10 @@ func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tc
 // newCancelChannel creates a channel that can cancel a pending forwarding
 // activity. The oldest channel is closed if the number of open channels would
 // exceed maxPendingResolutions.
-func (f *forwardQueue) newCancelChannel() chan struct{} {
-	f.Lock()
-	defer f.Unlock()
-
+func (f *packetsPendingLinkResolution) newCancelChannelLocked() chan struct{} {
 	if len(f.cancelChans) == maxPendingResolutions {
 		ch := f.cancelChans[0]
+		f.cancelChans[0] = nil
 		f.cancelChans = f.cancelChans[1:]
 		close(ch)
 	}
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index aca2f77f8..203f3b51f 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -15,6 +15,8 @@
 package stack
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -125,6 +127,26 @@ type PacketEndpoint interface {
 	HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
 }
 
+// UnknownDestinationPacketDisposition enumerates the possible return vaues from
+// HandleUnknownDestinationPacket().
+type UnknownDestinationPacketDisposition int
+
+const (
+	// UnknownDestinationPacketMalformed denotes that the packet was malformed
+	// and no further processing should be attempted other than updating
+	// statistics.
+	UnknownDestinationPacketMalformed UnknownDestinationPacketDisposition = iota
+
+	// UnknownDestinationPacketUnhandled tells the caller that the packet was
+	// well formed but that the issue was not handled and the stack should take
+	// the default action.
+	UnknownDestinationPacketUnhandled
+
+	// UnknownDestinationPacketHandled tells the caller that it should do
+	// no further processing.
+	UnknownDestinationPacketHandled
+)
+
 // TransportProtocol is the interface that needs to be implemented by transport
 // protocols (e.g., tcp, udp) that want to be part of the networking stack.
 type TransportProtocol interface {
@@ -132,10 +154,10 @@ type TransportProtocol interface {
 	Number() tcpip.TransportProtocolNumber
 
 	// NewEndpoint creates a new endpoint of the transport protocol.
-	NewEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+	NewEndpoint(netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
 
 	// NewRawEndpoint creates a new raw endpoint of the transport protocol.
-	NewRawEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+	NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
 
 	// MinimumPacketSize returns the minimum valid packet size of this
 	// transport protocol. The stack automatically drops any packets smaller
@@ -147,24 +169,22 @@ type TransportProtocol interface {
 	ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error)
 
 	// HandleUnknownDestinationPacket handles packets targeted at this
-	// protocol but that don't match any existing endpoint. For example,
-	// it is targeted at a port that have no listeners.
-	//
-	// The return value indicates whether the packet was well-formed (for
-	// stats purposes only).
+	// protocol that don't match any existing endpoint. For example,
+	// it is targeted at a port that has no listeners.
 	//
-	// HandleUnknownDestinationPacket takes ownership of pkt.
-	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) bool
+	// HandleUnknownDestinationPacket takes ownership of pkt if it handles
+	// the issue.
+	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) UnknownDestinationPacketDisposition
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
 	// provided option value is invalid.
-	SetOption(option interface{}) *tcpip.Error
+	SetOption(option tcpip.SettableTransportProtocolOption) *tcpip.Error
 
 	// Option allows retrieving protocol specific option values.
 	// Option returns an error if the option is not supported or the
 	// provided option value is invalid.
-	Option(option interface{}) *tcpip.Error
+	Option(option tcpip.GettableTransportProtocolOption) *tcpip.Error
 
 	// Close requests that any worker goroutines owned by the protocol
 	// stop.
@@ -179,6 +199,25 @@ type TransportProtocol interface {
 	Parse(pkt *PacketBuffer) (ok bool)
 }
 
+// TransportPacketDisposition is the result from attempting to deliver a packet
+// to the transport layer.
+type TransportPacketDisposition int
+
+const (
+	// TransportPacketHandled indicates that a transport packet was handled by the
+	// transport layer and callers need not take any further action.
+	TransportPacketHandled TransportPacketDisposition = iota
+
+	// TransportPacketProtocolUnreachable indicates that the transport
+	// protocol requested in the packet is not supported.
+	TransportPacketProtocolUnreachable
+
+	// TransportPacketDestinationPortUnreachable indicates that there weren't any
+	// listeners interested in the packet and the transport protocol has no means
+	// to notify the sender.
+	TransportPacketDestinationPortUnreachable
+)
+
 // TransportDispatcher contains the methods used by the network stack to deliver
 // packets to the appropriate transport endpoint after it has been handled by
 // the network layer.
@@ -189,7 +228,7 @@ type TransportDispatcher interface {
 	// pkt.NetworkHeader must be set before calling DeliverTransportPacket.
 	//
 	// DeliverTransportPacket takes ownership of pkt.
-	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer)
+	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) TransportPacketDisposition
 
 	// DeliverTransportControlPacket delivers control packets to the
 	// appropriate transport protocol endpoint.
@@ -226,9 +265,255 @@ type NetworkHeaderParams struct {
 	TOS uint8
 }
 
+// GroupAddressableEndpoint is an endpoint that supports group addressing.
+//
+// An endpoint is considered to support group addressing when one or more
+// endpoints may associate themselves with the same identifier (group address).
+type GroupAddressableEndpoint interface {
+	// JoinGroup joins the spcified group.
+	//
+	// Returns true if the group was newly joined.
+	JoinGroup(group tcpip.Address) (bool, *tcpip.Error)
+
+	// LeaveGroup attempts to leave the specified group.
+	//
+	// Returns tcpip.ErrBadLocalAddress if the endpoint has not joined the group.
+	LeaveGroup(group tcpip.Address) (bool, *tcpip.Error)
+
+	// IsInGroup returns true if the endpoint is a member of the specified group.
+	IsInGroup(group tcpip.Address) bool
+}
+
+// PrimaryEndpointBehavior is an enumeration of an AddressEndpoint's primary
+// behavior.
+type PrimaryEndpointBehavior int
+
+const (
+	// CanBePrimaryEndpoint indicates the endpoint can be used as a primary
+	// endpoint for new connections with no local address. This is the
+	// default when calling NIC.AddAddress.
+	CanBePrimaryEndpoint PrimaryEndpointBehavior = iota
+
+	// FirstPrimaryEndpoint indicates the endpoint should be the first
+	// primary endpoint considered. If there are multiple endpoints with
+	// this behavior, they are ordered by recency.
+	FirstPrimaryEndpoint
+
+	// NeverPrimaryEndpoint indicates the endpoint should never be a
+	// primary endpoint.
+	NeverPrimaryEndpoint
+)
+
+// AddressConfigType is the method used to add an address.
+type AddressConfigType int
+
+const (
+	// AddressConfigStatic is a statically configured address endpoint that was
+	// added by some user-specified action (adding an explicit address, joining a
+	// multicast group).
+	AddressConfigStatic AddressConfigType = iota
+
+	// AddressConfigSlaac is an address endpoint added by SLAAC, as per RFC 4862
+	// section 5.5.3.
+	AddressConfigSlaac
+
+	// AddressConfigSlaacTemp is a temporary address endpoint added by SLAAC as
+	// per RFC 4941. Temporary SLAAC addresses are short-lived and are not
+	// to be valid (or preferred) forever; hence the term temporary.
+	AddressConfigSlaacTemp
+)
+
+// AssignableAddressEndpoint is a reference counted address endpoint that may be
+// assigned to a NetworkEndpoint.
+type AssignableAddressEndpoint interface {
+	// AddressWithPrefix returns the endpoint's address.
+	AddressWithPrefix() tcpip.AddressWithPrefix
+
+	// IsAssigned returns whether or not the endpoint is considered bound
+	// to its NetworkEndpoint.
+	IsAssigned(allowExpired bool) bool
+
+	// IncRef increments this endpoint's reference count.
+	//
+	// Returns true if it was successfully incremented. If it returns false, then
+	// the endpoint is considered expired and should no longer be used.
+	IncRef() bool
+
+	// DecRef decrements this endpoint's reference count.
+	DecRef()
+}
+
+// AddressEndpoint is an endpoint representing an address assigned to an
+// AddressableEndpoint.
+type AddressEndpoint interface {
+	AssignableAddressEndpoint
+
+	// GetKind returns the address kind for this endpoint.
+	GetKind() AddressKind
+
+	// SetKind sets the address kind for this endpoint.
+	SetKind(AddressKind)
+
+	// ConfigType returns the method used to add the address.
+	ConfigType() AddressConfigType
+
+	// Deprecated returns whether or not this endpoint is deprecated.
+	Deprecated() bool
+
+	// SetDeprecated sets this endpoint's deprecated status.
+	SetDeprecated(bool)
+}
+
+// AddressKind is the kind of of an address.
+//
+// See the values of AddressKind for more details.
+type AddressKind int
+
+const (
+	// PermanentTentative is a permanent address endpoint that is not yet
+	// considered to be fully bound to an interface in the traditional
+	// sense. That is, the address is associated with a NIC, but packets
+	// destined to the address MUST NOT be accepted and MUST be silently
+	// dropped, and the address MUST NOT be used as a source address for
+	// outgoing packets. For IPv6, addresses are of this kind until NDP's
+	// Duplicate Address Detection (DAD) resolves. If DAD fails, the address
+	// is removed.
+	PermanentTentative AddressKind = iota
+
+	// Permanent is a permanent endpoint (vs. a temporary one) assigned to the
+	// NIC. Its reference count is biased by 1 to avoid removal when no route
+	// holds a reference to it. It is removed by explicitly removing the address
+	// from the NIC.
+	Permanent
+
+	// PermanentExpired is a permanent endpoint that had its address removed from
+	// the NIC, and it is waiting to be removed once no references to it are held.
+	//
+	// If the address is re-added before the endpoint is removed, its type
+	// changes back to Permanent.
+	PermanentExpired
+
+	// Temporary is an endpoint, created on a one-off basis to temporarily
+	// consider the NIC bound an an address that it is not explictiy bound to
+	// (such as a permanent address). Its reference count must not be biased by 1
+	// so that the address is removed immediately when references to it are no
+	// longer held.
+	//
+	// A temporary endpoint may be promoted to permanent if the address is added
+	// permanently.
+	Temporary
+)
+
+// IsPermanent returns true if the AddressKind represents a permanent address.
+func (k AddressKind) IsPermanent() bool {
+	switch k {
+	case Permanent, PermanentTentative:
+		return true
+	case Temporary, PermanentExpired:
+		return false
+	default:
+		panic(fmt.Sprintf("unrecognized address kind = %d", k))
+	}
+}
+
+// AddressableEndpoint is an endpoint that supports addressing.
+//
+// An endpoint is considered to support addressing when the endpoint may
+// associate itself with an identifier (address).
+type AddressableEndpoint interface {
+	// AddAndAcquirePermanentAddress adds the passed permanent address.
+	//
+	// Returns tcpip.ErrDuplicateAddress if the address exists.
+	//
+	// Acquires and returns the AddressEndpoint for the added address.
+	AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior, configType AddressConfigType, deprecated bool) (AddressEndpoint, *tcpip.Error)
+
+	// RemovePermanentAddress removes the passed address if it is a permanent
+	// address.
+	//
+	// Returns tcpip.ErrBadLocalAddress if the endpoint does not have the passed
+	// permanent address.
+	RemovePermanentAddress(addr tcpip.Address) *tcpip.Error
+
+	// MainAddress returns the endpoint's primary permanent address.
+	MainAddress() tcpip.AddressWithPrefix
+
+	// AcquireAssignedAddress returns an address endpoint for the passed address
+	// that is considered bound to the endpoint, optionally creating a temporary
+	// endpoint if requested and no existing address exists.
+	//
+	// The returned endpoint's reference count is incremented.
+	//
+	// Returns nil if the specified address is not local to this endpoint.
+	AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB PrimaryEndpointBehavior) AddressEndpoint
+
+	// AcquireOutgoingPrimaryAddress returns a primary address that may be used as
+	// a source address when sending packets to the passed remote address.
+	//
+	// If allowExpired is true, expired addresses may be returned.
+	//
+	// The returned endpoint's reference count is incremented.
+	//
+	// Returns nil if a primary address is not available.
+	AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) AddressEndpoint
+
+	// PrimaryAddresses returns the primary addresses.
+	PrimaryAddresses() []tcpip.AddressWithPrefix
+
+	// PermanentAddresses returns all the permanent addresses.
+	PermanentAddresses() []tcpip.AddressWithPrefix
+}
+
+// NDPEndpoint is a network endpoint that supports NDP.
+type NDPEndpoint interface {
+	NetworkEndpoint
+
+	// InvalidateDefaultRouter invalidates a default router discovered through
+	// NDP.
+	InvalidateDefaultRouter(tcpip.Address)
+}
+
+// NetworkInterface is a network interface.
+type NetworkInterface interface {
+	NetworkLinkEndpoint
+
+	// ID returns the interface's ID.
+	ID() tcpip.NICID
+
+	// IsLoopback returns true if the interface is a loopback interface.
+	IsLoopback() bool
+
+	// Name returns the name of the interface.
+	//
+	// May return an empty string if the interface is not configured with a name.
+	Name() string
+
+	// Enabled returns true if the interface is enabled.
+	Enabled() bool
+
+	// WritePacketToRemote writes the packet to the given remote link address.
+	WritePacketToRemote(tcpip.LinkAddress, *GSO, tcpip.NetworkProtocolNumber, *PacketBuffer) *tcpip.Error
+}
+
 // NetworkEndpoint is the interface that needs to be implemented by endpoints
 // of network layer protocols (e.g., ipv4, ipv6).
 type NetworkEndpoint interface {
+	AddressableEndpoint
+
+	// Enable enables the endpoint.
+	//
+	// Must only be called when the stack is in a state that allows the endpoint
+	// to send and receive packets.
+	//
+	// Returns tcpip.ErrNotPermitted if the endpoint cannot be enabled.
+	Enable() *tcpip.Error
+
+	// Enabled returns true if the endpoint is enabled.
+	Enabled() bool
+
+	// Disable disables the endpoint.
+	Disable()
+
 	// DefaultTTL is the default time-to-live value (or hop limit, in ipv6)
 	// for this endpoint.
 	DefaultTTL() uint8
@@ -238,10 +523,6 @@ type NetworkEndpoint interface {
 	// minus the network endpoint max header length.
 	MTU() uint32
 
-	// Capabilities returns the set of capabilities supported by the
-	// underlying link-layer endpoint.
-	Capabilities() LinkEndpointCapabilities
-
 	// MaxHeaderLength returns the maximum size the network (and lower
 	// level layers combined) headers can have. Higher levels use this
 	// information to reserve space in the front of the packets they're
@@ -262,9 +543,6 @@ type NetworkEndpoint interface {
 	// header to the given destination address. It takes ownership of pkt.
 	WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error
 
-	// NICID returns the id of the NIC this endpoint belongs to.
-	NICID() tcpip.NICID
-
 	// HandlePacket is called by the link layer when new packets arrive to
 	// this network endpoint. It sets pkt.NetworkHeader.
 	//
@@ -279,6 +557,17 @@ type NetworkEndpoint interface {
 	NetworkProtocolNumber() tcpip.NetworkProtocolNumber
 }
 
+// ForwardingNetworkProtocol is a NetworkProtocol that may forward packets.
+type ForwardingNetworkProtocol interface {
+	NetworkProtocol
+
+	// Forwarding returns the forwarding configuration.
+	Forwarding() bool
+
+	// SetForwarding sets the forwarding configuration.
+	SetForwarding(bool)
+}
+
 // NetworkProtocol is the interface that needs to be implemented by network
 // protocols (e.g., ipv4, ipv6) that want to be part of the networking stack.
 type NetworkProtocol interface {
@@ -298,17 +587,17 @@ type NetworkProtocol interface {
 	ParseAddresses(v buffer.View) (src, dst tcpip.Address)
 
 	// NewEndpoint creates a new endpoint of this protocol.
-	NewEndpoint(nicID tcpip.NICID, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, sender LinkEndpoint, st *Stack) NetworkEndpoint
+	NewEndpoint(nic NetworkInterface, linkAddrCache LinkAddressCache, nud NUDHandler, dispatcher TransportDispatcher) NetworkEndpoint
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
 	// provided option value is invalid.
-	SetOption(option interface{}) *tcpip.Error
+	SetOption(option tcpip.SettableNetworkProtocolOption) *tcpip.Error
 
 	// Option allows retrieving protocol specific option values.
 	// Option returns an error if the option is not supported or the
 	// provided option value is invalid.
-	Option(option interface{}) *tcpip.Error
+	Option(option tcpip.GettableNetworkProtocolOption) *tcpip.Error
 
 	// Close requests that any worker goroutines owned by the protocol
 	// stop.
@@ -376,22 +665,15 @@ const (
 	CapabilitySoftwareGSO
 )
 
-// LinkEndpoint is the interface implemented by data link layer protocols (e.g.,
-// ethernet, loopback, raw) and used by network layer protocols to send packets
-// out through the implementer's data link endpoint. When a link header exists,
-// it sets each PacketBuffer's LinkHeader field before passing it up the
-// stack.
-type LinkEndpoint interface {
+// NetworkLinkEndpoint is a data-link layer that supports sending network
+// layer packets.
+type NetworkLinkEndpoint interface {
 	// MTU is the maximum transmission unit for this endpoint. This is
 	// usually dictated by the backing physical network; when such a
 	// physical network doesn't exist, the limit is generally 64k, which
 	// includes the maximum size of an IP packet.
 	MTU() uint32
 
-	// Capabilities returns the set of capabilities supported by the
-	// endpoint.
-	Capabilities() LinkEndpointCapabilities
-
 	// MaxHeaderLength returns the maximum size the data link (and
 	// lower level layers combined) headers can have. Higher levels use this
 	// information to reserve space in the front of the packets they're
@@ -399,7 +681,7 @@ type LinkEndpoint interface {
 	MaxHeaderLength() uint16
 
 	// LinkAddress returns the link address (typically a MAC) of the
-	// link endpoint.
+	// endpoint.
 	LinkAddress() tcpip.LinkAddress
 
 	// WritePacket writes a packet with the given protocol through the
@@ -419,6 +701,19 @@ type LinkEndpoint interface {
 	// offload is enabled. If it will be used for something else, it may
 	// require to change syscall filters.
 	WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
+}
+
+// LinkEndpoint is the interface implemented by data link layer protocols (e.g.,
+// ethernet, loopback, raw) and used by network layer protocols to send packets
+// out through the implementer's data link endpoint. When a link header exists,
+// it sets each PacketBuffer's LinkHeader field before passing it up the
+// stack.
+type LinkEndpoint interface {
+	NetworkLinkEndpoint
+
+	// Capabilities returns the set of capabilities supported by the
+	// endpoint.
+	Capabilities() LinkEndpointCapabilities
 
 	// WriteRawPacket writes a packet directly to the link. The packet
 	// should already have an ethernet header. It takes ownership of vv.
@@ -427,8 +722,8 @@ type LinkEndpoint interface {
 	// Attach attaches the data link layer endpoint to the network-layer
 	// dispatcher of the stack.
 	//
-	// Attach will be called with a nil dispatcher if the receiver's associated
-	// NIC is being removed.
+	// Attach is called with a nil dispatcher when the endpoint's NIC is being
+	// removed.
 	Attach(dispatcher NetworkDispatcher)
 
 	// IsAttached returns whether a NetworkDispatcher is attached to the
@@ -472,13 +767,13 @@ type InjectableLinkEndpoint interface {
 // A LinkAddressResolver is an extension to a NetworkProtocol that
 // can resolve link addresses.
 type LinkAddressResolver interface {
-	// LinkAddressRequest sends a request for the LinkAddress of addr. Broadcasts
-	// the request on the local network if remoteLinkAddr is the zero value. The
-	// request is sent on linkEP with localAddr as the source.
+	// LinkAddressRequest sends a request for the link address of the target
+	// address. The request is broadcasted on the local network if a remote link
+	// address is not provided.
 	//
-	// A valid response will cause the discovery protocol's network
-	// endpoint to call AddLinkAddress.
-	LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error
+	// The request is sent from the passed network interface. If the interface
+	// local address is unspecified, any interface local address may be used.
+	LinkAddressRequest(targetAddr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, nic NetworkInterface) *tcpip.Error
 
 	// ResolveStaticAddress attempts to resolve address without sending
 	// requests. It either resolves the name immediately or returns the
@@ -488,7 +783,7 @@ type LinkAddressResolver interface {
 	ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool)
 
 	// LinkAddressProtocol returns the network protocol of the
-	// addresses this this resolver can resolve.
+	// addresses this resolver can resolve.
 	LinkAddressProtocol() tcpip.NetworkProtocolNumber
 }
 
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index e267bebb0..b76e2d37b 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -42,21 +42,27 @@ type Route struct {
 	// NetProto is the network-layer protocol.
 	NetProto tcpip.NetworkProtocolNumber
 
-	// ref a reference to the network endpoint through which the route
-	// starts.
-	ref *referencedNetworkEndpoint
-
 	// Loop controls where WritePacket should send packets.
 	Loop PacketLooping
 
-	// directedBroadcast indicates whether this route is sending a directed
-	// broadcast packet.
-	directedBroadcast bool
+	// nic is the NIC the route goes through.
+	nic *NIC
+
+	// addressEndpoint is the local address this route is associated with.
+	addressEndpoint AssignableAddressEndpoint
+
+	// linkCache is set if link address resolution is enabled for this protocol on
+	// the route's NIC.
+	linkCache LinkAddressCache
+
+	// linkRes is set if link address resolution is enabled for this protocol on
+	// the route's NIC.
+	linkRes LinkAddressResolver
 }
 
 // makeRoute initializes a new route. It takes ownership of the provided
-// reference to a network endpoint.
-func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, handleLocal, multicastLoop bool) Route {
+// AssignableAddressEndpoint.
+func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, nic *NIC, addressEndpoint AssignableAddressEndpoint, handleLocal, multicastLoop bool) Route {
 	loop := PacketOut
 	if handleLocal && localAddr != "" && remoteAddr == localAddr {
 		loop = PacketLoop
@@ -66,29 +72,39 @@ func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip
 		loop |= PacketLoop
 	}
 
-	return Route{
+	r := Route{
 		NetProto:         netProto,
 		LocalAddress:     localAddr,
-		LocalLinkAddress: localLinkAddr,
+		LocalLinkAddress: nic.LinkEndpoint.LinkAddress(),
 		RemoteAddress:    remoteAddr,
-		ref:              ref,
+		addressEndpoint:  addressEndpoint,
+		nic:              nic,
 		Loop:             loop,
 	}
+
+	if r.nic.LinkEndpoint.Capabilities()&CapabilityResolutionRequired != 0 {
+		if linkRes, ok := r.nic.stack.linkAddrResolvers[r.NetProto]; ok {
+			r.linkRes = linkRes
+			r.linkCache = r.nic.stack
+		}
+	}
+
+	return r
 }
 
 // NICID returns the id of the NIC from which this route originates.
 func (r *Route) NICID() tcpip.NICID {
-	return r.ref.ep.NICID()
+	return r.nic.ID()
 }
 
 // MaxHeaderLength forwards the call to the network endpoint's implementation.
 func (r *Route) MaxHeaderLength() uint16 {
-	return r.ref.ep.MaxHeaderLength()
+	return r.nic.getNetworkEndpoint(r.NetProto).MaxHeaderLength()
 }
 
 // Stats returns a mutable copy of current stats.
 func (r *Route) Stats() tcpip.Stats {
-	return r.ref.nic.stack.Stats()
+	return r.nic.stack.Stats()
 }
 
 // PseudoHeaderChecksum forwards the call to the network endpoint's
@@ -99,12 +115,12 @@ func (r *Route) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, tot
 
 // Capabilities returns the link-layer capabilities of the route.
 func (r *Route) Capabilities() LinkEndpointCapabilities {
-	return r.ref.ep.Capabilities()
+	return r.nic.LinkEndpoint.Capabilities()
 }
 
 // GSOMaxSize returns the maximum GSO packet size.
 func (r *Route) GSOMaxSize() uint32 {
-	if gso, ok := r.ref.ep.(GSOEndpoint); ok {
+	if gso, ok := r.nic.LinkEndpoint.(GSOEndpoint); ok {
 		return gso.GSOMaxSize()
 	}
 	return 0
@@ -141,7 +157,17 @@ func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) {
 		}
 		nextAddr = r.RemoteAddress
 	}
-	linkAddr, ch, err := r.ref.linkCache.GetLinkAddress(r.ref.nic.ID(), nextAddr, r.LocalAddress, r.NetProto, waker)
+
+	if neigh := r.nic.neigh; neigh != nil {
+		entry, ch, err := neigh.entry(nextAddr, r.LocalAddress, r.linkRes, waker)
+		if err != nil {
+			return ch, err
+		}
+		r.RemoteLinkAddress = entry.LinkAddr
+		return nil, nil
+	}
+
+	linkAddr, ch, err := r.linkCache.GetLinkAddress(r.nic.ID(), nextAddr, r.LocalAddress, r.NetProto, waker)
 	if err != nil {
 		return ch, err
 	}
@@ -155,7 +181,13 @@ func (r *Route) RemoveWaker(waker *sleep.Waker) {
 	if nextAddr == "" {
 		nextAddr = r.RemoteAddress
 	}
-	r.ref.linkCache.RemoveWaker(r.ref.nic.ID(), nextAddr, waker)
+
+	if neigh := r.nic.neigh; neigh != nil {
+		neigh.removeWaker(nextAddr, waker)
+		return
+	}
+
+	r.linkCache.RemoveWaker(r.nic.ID(), nextAddr, waker)
 }
 
 // IsResolutionRequired returns true if Resolve() must be called to resolve
@@ -163,101 +195,63 @@ func (r *Route) RemoveWaker(waker *sleep.Waker) {
 //
 // The NIC r uses must not be locked.
 func (r *Route) IsResolutionRequired() bool {
-	return r.ref.isValidForOutgoing() && r.ref.linkCache != nil && r.RemoteLinkAddress == ""
+	if r.nic.neigh != nil {
+		return r.nic.isValidForOutgoing(r.addressEndpoint) && r.linkRes != nil && r.RemoteLinkAddress == ""
+	}
+	return r.nic.isValidForOutgoing(r.addressEndpoint) && r.linkCache != nil && r.RemoteLinkAddress == ""
 }
 
 // WritePacket writes the packet through the given route.
 func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error {
-	if !r.ref.isValidForOutgoing() {
+	if !r.nic.isValidForOutgoing(r.addressEndpoint) {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	// WritePacket takes ownership of pkt, calculate numBytes first.
-	numBytes := pkt.Size()
-
-	err := r.ref.ep.WritePacket(r, gso, params, pkt)
-	if err != nil {
-		r.Stats().IP.OutgoingPacketErrors.Increment()
-	} else {
-		r.ref.nic.stats.Tx.Packets.Increment()
-		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
-	}
-	return err
+	return r.nic.getNetworkEndpoint(r.NetProto).WritePacket(r, gso, params, pkt)
 }
 
 // WritePackets writes a list of n packets through the given route and returns
 // the number of packets written.
 func (r *Route) WritePackets(gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) {
-	if !r.ref.isValidForOutgoing() {
+	if !r.nic.isValidForOutgoing(r.addressEndpoint) {
 		return 0, tcpip.ErrInvalidEndpointState
 	}
 
-	// WritePackets takes ownership of pkt, calculate length first.
-	numPkts := pkts.Len()
-
-	n, err := r.ref.ep.WritePackets(r, gso, pkts, params)
-	if err != nil {
-		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(numPkts - n))
-	}
-	r.ref.nic.stats.Tx.Packets.IncrementBy(uint64(n))
-
-	writtenBytes := 0
-	for i, pb := 0, pkts.Front(); i < n && pb != nil; i, pb = i+1, pb.Next() {
-		writtenBytes += pb.Size()
-	}
-
-	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(writtenBytes))
-	return n, err
+	return r.nic.getNetworkEndpoint(r.NetProto).WritePackets(r, gso, pkts, params)
 }
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
 func (r *Route) WriteHeaderIncludedPacket(pkt *PacketBuffer) *tcpip.Error {
-	if !r.ref.isValidForOutgoing() {
+	if !r.nic.isValidForOutgoing(r.addressEndpoint) {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	// WriteHeaderIncludedPacket takes ownership of pkt, calculate numBytes first.
-	numBytes := pkt.Data.Size()
-
-	if err := r.ref.ep.WriteHeaderIncludedPacket(r, pkt); err != nil {
-		r.Stats().IP.OutgoingPacketErrors.Increment()
-		return err
-	}
-	r.ref.nic.stats.Tx.Packets.Increment()
-	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
-	return nil
+	return r.nic.getNetworkEndpoint(r.NetProto).WriteHeaderIncludedPacket(r, pkt)
 }
 
 // DefaultTTL returns the default TTL of the underlying network endpoint.
 func (r *Route) DefaultTTL() uint8 {
-	return r.ref.ep.DefaultTTL()
+	return r.nic.getNetworkEndpoint(r.NetProto).DefaultTTL()
 }
 
 // MTU returns the MTU of the underlying network endpoint.
 func (r *Route) MTU() uint32 {
-	return r.ref.ep.MTU()
-}
-
-// NetworkProtocolNumber returns the NetworkProtocolNumber of the underlying
-// network endpoint.
-func (r *Route) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
-	return r.ref.ep.NetworkProtocolNumber()
+	return r.nic.getNetworkEndpoint(r.NetProto).MTU()
 }
 
 // Release frees all resources associated with the route.
 func (r *Route) Release() {
-	if r.ref != nil {
-		r.ref.decRef()
-		r.ref = nil
+	if r.addressEndpoint != nil {
+		r.addressEndpoint.DecRef()
+		r.addressEndpoint = nil
 	}
 }
 
-// Clone Clone a route such that the original one can be released and the new
-// one will remain valid.
+// Clone clones the route.
 func (r *Route) Clone() Route {
-	if r.ref != nil {
-		r.ref.incRef()
+	if r.addressEndpoint != nil {
+		_ = r.addressEndpoint.IncRef()
 	}
 	return *r
 }
@@ -281,27 +275,30 @@ func (r *Route) MakeLoopedRoute() Route {
 
 // Stack returns the instance of the Stack that owns this route.
 func (r *Route) Stack() *Stack {
-	return r.ref.stack()
+	return r.nic.stack
+}
+
+func (r *Route) isV4Broadcast(addr tcpip.Address) bool {
+	if addr == header.IPv4Broadcast {
+		return true
+	}
+
+	subnet := r.addressEndpoint.AddressWithPrefix().Subnet()
+	return subnet.IsBroadcast(addr)
 }
 
 // IsOutboundBroadcast returns true if the route is for an outbound broadcast
 // packet.
 func (r *Route) IsOutboundBroadcast() bool {
 	// Only IPv4 has a notion of broadcast.
-	return r.directedBroadcast || r.RemoteAddress == header.IPv4Broadcast
+	return r.isV4Broadcast(r.RemoteAddress)
 }
 
 // IsInboundBroadcast returns true if the route is for an inbound broadcast
 // packet.
 func (r *Route) IsInboundBroadcast() bool {
 	// Only IPv4 has a notion of broadcast.
-	if r.LocalAddress == header.IPv4Broadcast {
-		return true
-	}
-
-	addr := r.ref.addrWithPrefix()
-	subnet := addr.Subnet()
-	return subnet.IsBroadcast(r.LocalAddress)
+	return r.isV4Broadcast(r.LocalAddress)
 }
 
 // ReverseRoute returns new route with given source and destination address.
@@ -312,7 +309,10 @@ func (r *Route) ReverseRoute(src tcpip.Address, dst tcpip.Address) Route {
 		LocalLinkAddress:  r.RemoteLinkAddress,
 		RemoteAddress:     src,
 		RemoteLinkAddress: r.LocalLinkAddress,
-		ref:               r.ref,
 		Loop:              r.Loop,
+		addressEndpoint:   r.addressEndpoint,
+		nic:               r.nic,
+		linkCache:         r.linkCache,
+		linkRes:           r.linkRes,
 	}
 }
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index a3f87c8af..e8f1c110e 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -144,10 +144,7 @@ type TCPReceiverState struct {
 
 	// PendingBufUsed is the number of bytes pending in the receive
 	// queue.
-	PendingBufUsed seqnum.Size
-
-	// PendingBufSize is the size of the socket receive buffer.
-	PendingBufSize seqnum.Size
+	PendingBufUsed int
 }
 
 // TCPSenderState holds a copy of the internal state of the sender for
@@ -248,7 +245,7 @@ type RcvBufAutoTuneParams struct {
 	// was started.
 	MeasureTime time.Time
 
-	// CopiedBytes is the number of bytes copied to userspace since
+	// CopiedBytes is the number of bytes copied to user space since
 	// this measure began.
 	CopiedBytes int
 
@@ -366,38 +363,6 @@ func (u *uniqueIDGenerator) UniqueID() uint64 {
 	return atomic.AddUint64((*uint64)(u), 1)
 }
 
-// NICNameFromID is a function that returns a stable name for the specified NIC,
-// even if different NIC IDs are used to refer to the same NIC in different
-// program runs. It is used when generating opaque interface identifiers (IIDs).
-// If the NIC was created with a name, it will be passed to NICNameFromID.
-//
-// NICNameFromID SHOULD return unique NIC names so unique opaque IIDs are
-// generated for the same prefix on differnt NICs.
-type NICNameFromID func(tcpip.NICID, string) string
-
-// OpaqueInterfaceIdentifierOptions holds the options related to the generation
-// of opaque interface indentifiers (IIDs) as defined by RFC 7217.
-type OpaqueInterfaceIdentifierOptions struct {
-	// NICNameFromID is a function that returns a stable name for a specified NIC,
-	// even if the NIC ID changes over time.
-	//
-	// Must be specified to generate the opaque IID.
-	NICNameFromID NICNameFromID
-
-	// SecretKey is a pseudo-random number used as the secret key when generating
-	// opaque IIDs as defined by RFC 7217. The key SHOULD be at least
-	// header.OpaqueIIDSecretKeyMinBytes bytes and MUST follow minimum randomness
-	// requirements for security as outlined by RFC 4086. SecretKey MUST NOT
-	// change between program runs, unless explicitly changed.
-	//
-	// OpaqueInterfaceIdentifierOptions takes ownership of SecretKey. SecretKey
-	// MUST NOT be modified after Stack is created.
-	//
-	// May be nil, but a nil value is highly discouraged to maintain
-	// some level of randomness between nodes.
-	SecretKey []byte
-}
-
 // Stack is a networking stack, with all supported protocols, NICs, and route
 // table.
 type Stack struct {
@@ -415,10 +380,12 @@ type Stack struct {
 
 	linkAddrCache *linkAddrCache
 
-	mu               sync.RWMutex
-	nics             map[tcpip.NICID]*NIC
-	forwarding       bool
-	cleanupEndpoints map[TransportEndpoint]struct{}
+	mu   sync.RWMutex
+	nics map[tcpip.NICID]*NIC
+
+	// cleanupEndpointsMu protects cleanupEndpoints.
+	cleanupEndpointsMu sync.Mutex
+	cleanupEndpoints   map[TransportEndpoint]struct{}
 
 	// route is the route table passed in by the user via SetRouteTable(),
 	// it is used by FindRoute() to build a route for a specific
@@ -429,7 +396,7 @@ type Stack struct {
 
 	// If not nil, then any new endpoints will have this probe function
 	// invoked everytime they receive a TCP segment.
-	tcpProbeFunc TCPProbeFunc
+	tcpProbeFunc atomic.Value // TCPProbeFunc
 
 	// clock is used to generate user-visible times.
 	clock tcpip.Clock
@@ -455,20 +422,12 @@ type Stack struct {
 	// TODO(gvisor.dev/issue/940): S/R this field.
 	seed uint32
 
-	// ndpConfigs is the default NDP configurations used by interfaces.
-	ndpConfigs NDPConfigurations
-
 	// nudConfigs is the default NUD configurations used by interfaces.
 	nudConfigs NUDConfigurations
 
-	// autoGenIPv6LinkLocal determines whether or not the stack will attempt
-	// to auto-generate an IPv6 link-local address for newly enabled non-loopback
-	// NICs. See the AutoGenIPv6LinkLocal field of Options for more details.
-	autoGenIPv6LinkLocal bool
-
-	// ndpDisp is the NDP event dispatcher that is used to send the netstack
-	// integrator NDP related events.
-	ndpDisp NDPDispatcher
+	// useNeighborCache indicates whether ARP and NDP packets should be handled
+	// by the NIC's neighborCache instead of linkAddrCache.
+	useNeighborCache bool
 
 	// nudDisp is the NUD event dispatcher that is used to send the netstack
 	// integrator NUD related events.
@@ -477,17 +436,9 @@ type Stack struct {
 	// uniqueIDGenerator is a generator of unique identifiers.
 	uniqueIDGenerator UniqueID
 
-	// opaqueIIDOpts hold the options for generating opaque interface identifiers
-	// (IIDs) as outlined by RFC 7217.
-	opaqueIIDOpts OpaqueInterfaceIdentifierOptions
-
-	// tempIIDSeed is used to seed the initial temporary interface identifier
-	// history value used to generate IIDs for temporary SLAAC addresses.
-	tempIIDSeed []byte
-
-	// forwarder holds the packets that wait for their link-address resolutions
-	// to complete, and forwards them when each resolution is done.
-	forwarder *forwardQueue
+	// linkResQueue holds packets that are waiting for link resolution to
+	// complete.
+	linkResQueue packetsPendingLinkResolution
 
 	// randomGenerator is an injectable pseudo random generator that can be
 	// used when a random number is required.
@@ -507,13 +458,25 @@ type UniqueID interface {
 	UniqueID() uint64
 }
 
+// NetworkProtocolFactory instantiates a network protocol.
+//
+// NetworkProtocolFactory must not attempt to modify the stack, it may only
+// query the stack.
+type NetworkProtocolFactory func(*Stack) NetworkProtocol
+
+// TransportProtocolFactory instantiates a transport protocol.
+//
+// TransportProtocolFactory must not attempt to modify the stack, it may only
+// query the stack.
+type TransportProtocolFactory func(*Stack) TransportProtocol
+
 // Options contains optional Stack configuration.
 type Options struct {
 	// NetworkProtocols lists the network protocols to enable.
-	NetworkProtocols []NetworkProtocol
+	NetworkProtocols []NetworkProtocolFactory
 
 	// TransportProtocols lists the transport protocols to enable.
-	TransportProtocols []TransportProtocol
+	TransportProtocols []TransportProtocolFactory
 
 	// Clock is an optional clock source used for timestampping packets.
 	//
@@ -531,33 +494,15 @@ type Options struct {
 	// UniqueID is an optional generator of unique identifiers.
 	UniqueID UniqueID
 
-	// NDPConfigs is the default NDP configurations used by interfaces.
-	//
-	// By default, NDPConfigs will have a zero value for its
-	// DupAddrDetectTransmits field, implying that DAD will not be performed
-	// before assigning an address to a NIC.
-	NDPConfigs NDPConfigurations
-
 	// NUDConfigs is the default NUD configurations used by interfaces.
 	NUDConfigs NUDConfigurations
 
-	// AutoGenIPv6LinkLocal determines whether or not the stack will attempt to
-	// auto-generate an IPv6 link-local address for newly enabled non-loopback
-	// NICs.
-	//
-	// Note, setting this to true does not mean that a link-local address
-	// will be assigned right away, or at all. If Duplicate Address Detection
-	// is enabled, an address will only be assigned if it successfully resolves.
-	// If it fails, no further attempt will be made to auto-generate an IPv6
-	// link-local address.
-	//
-	// The generated link-local address will follow RFC 4291 Appendix A
-	// guidelines.
-	AutoGenIPv6LinkLocal bool
-
-	// NDPDisp is the NDP event dispatcher that an integrator can provide to
-	// receive NDP related events.
-	NDPDisp NDPDispatcher
+	// UseNeighborCache indicates whether ARP and NDP packets should be handled
+	// by the Neighbor Unreachability Detection (NUD) state machine. This flag
+	// also enables the APIs for inspecting and modifying the neighbor table via
+	// NUDDispatcher and the following Stack methods: Neighbors, RemoveNeighbor,
+	// and ClearNeighbors.
+	UseNeighborCache bool
 
 	// NUDDisp is the NUD event dispatcher that an integrator can provide to
 	// receive NUD related events.
@@ -567,31 +512,12 @@ type Options struct {
 	// this is non-nil.
 	RawFactory RawFactory
 
-	// OpaqueIIDOpts hold the options for generating opaque interface
-	// identifiers (IIDs) as outlined by RFC 7217.
-	OpaqueIIDOpts OpaqueInterfaceIdentifierOptions
-
 	// RandSource is an optional source to use to generate random
 	// numbers. If omitted it defaults to a Source seeded by the data
 	// returned by rand.Read().
 	//
 	// RandSource must be thread-safe.
 	RandSource mathrand.Source
-
-	// TempIIDSeed is used to seed the initial temporary interface identifier
-	// history value used to generate IIDs for temporary SLAAC addresses.
-	//
-	// Temporary SLAAC adresses are short-lived addresses which are unpredictable
-	// and random from the perspective of other nodes on the network. It is
-	// recommended that the seed be a random byte buffer of at least
-	// header.IIDSize bytes to make sure that temporary SLAAC addresses are
-	// sufficiently random. It should follow minimum randomness requirements for
-	// security as outlined by RFC 4086.
-	//
-	// Note: using a nil value, the same seed across netstack program runs, or a
-	// seed that is too small would reduce randomness and increase predictability,
-	// defeating the purpose of temporary SLAAC addresses.
-	TempIIDSeed []byte
 }
 
 // TransportEndpointInfo holds useful information about a transport endpoint
@@ -624,8 +550,8 @@ type TransportEndpointInfo struct {
 // incompatible with the receiver.
 //
 // Preconditon: the parent endpoint mu must be held while calling this method.
-func (e *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	netProto := e.NetProto
+func (t *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	netProto := t.NetProto
 	switch len(addr.Addr) {
 	case header.IPv4AddressSize:
 		netProto = header.IPv4ProtocolNumber
@@ -639,7 +565,7 @@ func (e *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6onl
 		}
 	}
 
-	switch len(e.ID.LocalAddress) {
+	switch len(t.ID.LocalAddress) {
 	case header.IPv4AddressSize:
 		if len(addr.Addr) == header.IPv6AddressSize {
 			return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState
@@ -651,8 +577,8 @@ func (e *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6onl
 	}
 
 	switch {
-	case netProto == e.NetProto:
-	case netProto == header.IPv4ProtocolNumber && e.NetProto == header.IPv6ProtocolNumber:
+	case netProto == t.NetProto:
+	case netProto == header.IPv4ProtocolNumber && t.NetProto == header.IPv6ProtocolNumber:
 		if v6only {
 			return tcpip.FullAddress{}, 0, tcpip.ErrNoRoute
 		}
@@ -694,35 +620,27 @@ func New(opts Options) *Stack {
 		randSrc = &lockedRandomSource{src: mathrand.NewSource(generateRandInt64())}
 	}
 
-	// Make sure opts.NDPConfigs contains valid values only.
-	opts.NDPConfigs.validate()
-
 	opts.NUDConfigs.resetInvalidFields()
 
 	s := &Stack{
-		transportProtocols:   make(map[tcpip.TransportProtocolNumber]*transportProtocolState),
-		networkProtocols:     make(map[tcpip.NetworkProtocolNumber]NetworkProtocol),
-		linkAddrResolvers:    make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver),
-		nics:                 make(map[tcpip.NICID]*NIC),
-		cleanupEndpoints:     make(map[TransportEndpoint]struct{}),
-		linkAddrCache:        newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts),
-		PortManager:          ports.NewPortManager(),
-		clock:                clock,
-		stats:                opts.Stats.FillIn(),
-		handleLocal:          opts.HandleLocal,
-		tables:               DefaultTables(),
-		icmpRateLimiter:      NewICMPRateLimiter(),
-		seed:                 generateRandUint32(),
-		ndpConfigs:           opts.NDPConfigs,
-		nudConfigs:           opts.NUDConfigs,
-		autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
-		uniqueIDGenerator:    opts.UniqueID,
-		ndpDisp:              opts.NDPDisp,
-		nudDisp:              opts.NUDDisp,
-		opaqueIIDOpts:        opts.OpaqueIIDOpts,
-		tempIIDSeed:          opts.TempIIDSeed,
-		forwarder:            newForwardQueue(),
-		randomGenerator:      mathrand.New(randSrc),
+		transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState),
+		networkProtocols:   make(map[tcpip.NetworkProtocolNumber]NetworkProtocol),
+		linkAddrResolvers:  make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver),
+		nics:               make(map[tcpip.NICID]*NIC),
+		cleanupEndpoints:   make(map[TransportEndpoint]struct{}),
+		linkAddrCache:      newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts),
+		PortManager:        ports.NewPortManager(),
+		clock:              clock,
+		stats:              opts.Stats.FillIn(),
+		handleLocal:        opts.HandleLocal,
+		tables:             DefaultTables(),
+		icmpRateLimiter:    NewICMPRateLimiter(),
+		seed:               generateRandUint32(),
+		nudConfigs:         opts.NUDConfigs,
+		useNeighborCache:   opts.UseNeighborCache,
+		uniqueIDGenerator:  opts.UniqueID,
+		nudDisp:            opts.NUDDisp,
+		randomGenerator:    mathrand.New(randSrc),
 		sendBufferSize: SendBufferSizeOption{
 			Min:     MinBufferSize,
 			Default: DefaultBufferSize,
@@ -734,9 +652,11 @@ func New(opts Options) *Stack {
 			Max:     DefaultMaxBufferSize,
 		},
 	}
+	s.linkResQueue.init()
 
 	// Add specified network protocols.
-	for _, netProto := range opts.NetworkProtocols {
+	for _, netProtoFactory := range opts.NetworkProtocols {
+		netProto := netProtoFactory(s)
 		s.networkProtocols[netProto.Number()] = netProto
 		if r, ok := netProto.(LinkAddressResolver); ok {
 			s.linkAddrResolvers[r.LinkAddressProtocol()] = r
@@ -744,7 +664,8 @@ func New(opts Options) *Stack {
 	}
 
 	// Add specified transport protocols.
-	for _, transProto := range opts.TransportProtocols {
+	for _, transProtoFactory := range opts.TransportProtocols {
+		transProto := transProtoFactory(s)
 		s.transportProtocols[transProto.Number()] = &transportProtocolState{
 			proto: transProto,
 		}
@@ -773,7 +694,7 @@ func (s *Stack) UniqueID() uint64 {
 // options. This method returns an error if the protocol is not supported or
 // option is not supported by the protocol implementation or the provided value
 // is incorrect.
-func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error {
+func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, option tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	netProto, ok := s.networkProtocols[network]
 	if !ok {
 		return tcpip.ErrUnknownProtocol
@@ -790,7 +711,7 @@ func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, op
 // if err != nil {
 //   ...
 // }
-func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error {
+func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, option tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	netProto, ok := s.networkProtocols[network]
 	if !ok {
 		return tcpip.ErrUnknownProtocol
@@ -802,7 +723,7 @@ func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, optio
 // options. This method returns an error if the protocol is not supported or
 // option is not supported by the protocol implementation or the provided value
 // is incorrect.
-func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error {
+func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumber, option tcpip.SettableTransportProtocolOption) *tcpip.Error {
 	transProtoState, ok := s.transportProtocols[transport]
 	if !ok {
 		return tcpip.ErrUnknownProtocol
@@ -817,7 +738,7 @@ func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumb
 // if err := s.TransportProtocolOption(tcpip.TCPProtocolNumber, &v); err != nil {
 //   ...
 // }
-func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error {
+func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, option tcpip.GettableTransportProtocolOption) *tcpip.Error {
 	transProtoState, ok := s.transportProtocols[transport]
 	if !ok {
 		return tcpip.ErrUnknownProtocol
@@ -851,46 +772,37 @@ func (s *Stack) Stats() tcpip.Stats {
 	return s.stats
 }
 
-// SetForwarding enables or disables the packet forwarding between NICs.
-//
-// When forwarding becomes enabled, any host-only state on all NICs will be
-// cleaned up and if IPv6 is enabled, NDP Router Solicitations will be started.
-// When forwarding becomes disabled and if IPv6 is enabled, NDP Router
-// Solicitations will be stopped.
-func (s *Stack) SetForwarding(enable bool) {
-	// TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward.
-	s.mu.Lock()
-	defer s.mu.Unlock()
+// SetForwarding enables or disables packet forwarding between NICs for the
+// passed protocol.
+func (s *Stack) SetForwarding(protocolNum tcpip.NetworkProtocolNumber, enable bool) *tcpip.Error {
+	protocol, ok := s.networkProtocols[protocolNum]
+	if !ok {
+		return tcpip.ErrUnknownProtocol
+	}
 
-	// If forwarding status didn't change, do nothing further.
-	if s.forwarding == enable {
-		return
+	forwardingProtocol, ok := protocol.(ForwardingNetworkProtocol)
+	if !ok {
+		return tcpip.ErrNotSupported
 	}
 
-	s.forwarding = enable
+	forwardingProtocol.SetForwarding(enable)
+	return nil
+}
 
-	// If this stack does not support IPv6, do nothing further.
-	if _, ok := s.networkProtocols[header.IPv6ProtocolNumber]; !ok {
-		return
+// Forwarding returns true if packet forwarding between NICs is enabled for the
+// passed protocol.
+func (s *Stack) Forwarding(protocolNum tcpip.NetworkProtocolNumber) bool {
+	protocol, ok := s.networkProtocols[protocolNum]
+	if !ok {
+		return false
 	}
 
-	if enable {
-		for _, nic := range s.nics {
-			nic.becomeIPv6Router()
-		}
-	} else {
-		for _, nic := range s.nics {
-			nic.becomeIPv6Host()
-		}
+	forwardingProtocol, ok := protocol.(ForwardingNetworkProtocol)
+	if !ok {
+		return false
 	}
-}
 
-// Forwarding returns if the packet forwarding between NICs is enabled.
-func (s *Stack) Forwarding() bool {
-	// TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward.
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-	return s.forwarding
+	return forwardingProtocol.Forwarding()
 }
 
 // SetRouteTable assigns the route table to be used by this stack. It
@@ -918,6 +830,20 @@ func (s *Stack) AddRoute(route tcpip.Route) {
 	s.routeTable = append(s.routeTable, route)
 }
 
+// RemoveRoutes removes matching routes from the route table.
+func (s *Stack) RemoveRoutes(match func(tcpip.Route) bool) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	var filteredRoutes []tcpip.Route
+	for _, route := range s.routeTable {
+		if !match(route) {
+			filteredRoutes = append(filteredRoutes, route)
+		}
+	}
+	s.routeTable = filteredRoutes
+}
+
 // NewEndpoint creates a new transport layer endpoint of the given protocol.
 func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	t, ok := s.transportProtocols[transport]
@@ -925,7 +851,7 @@ func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcp
 		return nil, tcpip.ErrUnknownProtocol
 	}
 
-	return t.proto.NewEndpoint(s, network, waiterQueue)
+	return t.proto.NewEndpoint(network, waiterQueue)
 }
 
 // NewRawEndpoint creates a new raw transport layer endpoint of the given
@@ -945,7 +871,7 @@ func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network
 		return nil, tcpip.ErrUnknownProtocol
 	}
 
-	return t.proto.NewRawEndpoint(s, network, waiterQueue)
+	return t.proto.NewRawEndpoint(network, waiterQueue)
 }
 
 // NewPacketEndpoint creates a new packet endpoint listening for the given
@@ -1016,16 +942,16 @@ func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
 	return s.CreateNICWithOptions(id, ep, NICOptions{})
 }
 
-// GetNICByName gets the NIC specified by name.
-func (s *Stack) GetNICByName(name string) (*NIC, bool) {
+// GetLinkEndpointByName gets the link endpoint specified by name.
+func (s *Stack) GetLinkEndpointByName(name string) LinkEndpoint {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 	for _, nic := range s.nics {
 		if nic.Name() == name {
-			return nic, true
+			return nic.LinkEndpoint
 		}
 	}
-	return nil, false
+	return nil
 }
 
 // EnableNIC enables the given NIC so that the link-layer endpoint can start
@@ -1052,7 +978,8 @@ func (s *Stack) DisableNIC(id tcpip.NICID) *tcpip.Error {
 		return tcpip.ErrUnknownNICID
 	}
 
-	return nic.disable()
+	nic.disable()
+	return nil
 }
 
 // CheckNIC checks if a NIC is usable.
@@ -1065,7 +992,7 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 		return false
 	}
 
-	return nic.enabled()
+	return nic.Enabled()
 }
 
 // RemoveNIC removes NIC and all related routes from the network stack.
@@ -1143,19 +1070,19 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo {
 	for id, nic := range s.nics {
 		flags := NICStateFlags{
 			Up:          true, // Netstack interfaces are always up.
-			Running:     nic.enabled(),
+			Running:     nic.Enabled(),
 			Promiscuous: nic.isPromiscuousMode(),
-			Loopback:    nic.isLoopback(),
+			Loopback:    nic.IsLoopback(),
 		}
 		nics[id] = NICInfo{
 			Name:              nic.name,
-			LinkAddress:       nic.linkEP.LinkAddress(),
-			ProtocolAddresses: nic.PrimaryAddresses(),
+			LinkAddress:       nic.LinkEndpoint.LinkAddress(),
+			ProtocolAddresses: nic.primaryAddresses(),
 			Flags:             flags,
-			MTU:               nic.linkEP.MTU(),
+			MTU:               nic.LinkEndpoint.MTU(),
 			Stats:             nic.stats,
 			Context:           nic.context,
-			ARPHardwareType:   nic.linkEP.ARPHardwareType(),
+			ARPHardwareType:   nic.LinkEndpoint.ARPHardwareType(),
 		}
 	}
 	return nics
@@ -1209,12 +1136,12 @@ func (s *Stack) AddProtocolAddressWithOptions(id tcpip.NICID, protocolAddress tc
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
-	nic := s.nics[id]
-	if nic == nil {
+	nic, ok := s.nics[id]
+	if !ok {
 		return tcpip.ErrUnknownNICID
 	}
 
-	return nic.AddAddress(protocolAddress, peb)
+	return nic.addAddress(protocolAddress, peb)
 }
 
 // RemoveAddress removes an existing network-layer address from the specified
@@ -1224,7 +1151,7 @@ func (s *Stack) RemoveAddress(id tcpip.NICID, addr tcpip.Address) *tcpip.Error {
 	defer s.mu.RUnlock()
 
 	if nic, ok := s.nics[id]; ok {
-		return nic.RemoveAddress(addr)
+		return nic.removeAddress(addr)
 	}
 
 	return tcpip.ErrUnknownNICID
@@ -1238,7 +1165,7 @@ func (s *Stack) AllAddresses() map[tcpip.NICID][]tcpip.ProtocolAddress {
 
 	nics := make(map[tcpip.NICID][]tcpip.ProtocolAddress)
 	for id, nic := range s.nics {
-		nics[id] = nic.AllAddresses()
+		nics[id] = nic.allPermanentAddresses()
 	}
 	return nics
 }
@@ -1260,7 +1187,7 @@ func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocol
 	return nic.primaryAddress(protocol), nil
 }
 
-func (s *Stack) getRefEP(nic *NIC, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (ref *referencedNetworkEndpoint) {
+func (s *Stack) getAddressEP(nic *NIC, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) AssignableAddressEndpoint {
 	if len(localAddr) == 0 {
 		return nic.primaryEndpoint(netProto, remoteAddr)
 	}
@@ -1277,9 +1204,9 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 	isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)
 	needRoute := !(isLocalBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr))
 	if id != 0 && !needRoute {
-		if nic, ok := s.nics[id]; ok && nic.enabled() {
-			if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
-				return makeRoute(netProto, ref.address(), remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()), nil
+		if nic, ok := s.nics[id]; ok && nic.Enabled() {
+			if addressEndpoint := s.getAddressEP(nic, localAddr, remoteAddr, netProto); addressEndpoint != nil {
+				return makeRoute(netProto, addressEndpoint.AddressWithPrefix().Address, remoteAddr, nic, addressEndpoint, s.handleLocal && !nic.IsLoopback(), multicastLoop && !nic.IsLoopback()), nil
 			}
 		}
 	} else {
@@ -1287,22 +1214,20 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 			if (id != 0 && id != route.NIC) || (len(remoteAddr) != 0 && !route.Destination.Contains(remoteAddr)) {
 				continue
 			}
-			if nic, ok := s.nics[route.NIC]; ok && nic.enabled() {
-				if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
+			if nic, ok := s.nics[route.NIC]; ok && nic.Enabled() {
+				if addressEndpoint := s.getAddressEP(nic, localAddr, remoteAddr, netProto); addressEndpoint != nil {
 					if len(remoteAddr) == 0 {
 						// If no remote address was provided, then the route
 						// provided will refer to the link local address.
-						remoteAddr = ref.address()
+						remoteAddr = addressEndpoint.AddressWithPrefix().Address
 					}
 
-					r := makeRoute(netProto, ref.address(), remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback())
-					r.directedBroadcast = route.Destination.IsBroadcast(remoteAddr)
-
+					r := makeRoute(netProto, addressEndpoint.AddressWithPrefix().Address, remoteAddr, nic, addressEndpoint, s.handleLocal && !nic.IsLoopback(), multicastLoop && !nic.IsLoopback())
 					if len(route.Gateway) > 0 {
 						if needRoute {
 							r.NextHop = route.Gateway
 						}
-					} else if r.directedBroadcast {
+					} else if subnet := addressEndpoint.AddressWithPrefix().Subnet(); subnet.IsBroadcast(remoteAddr) {
 						r.RemoteLinkAddress = header.EthernetBroadcastAddress
 					}
 
@@ -1335,26 +1260,25 @@ func (s *Stack) CheckLocalAddress(nicID tcpip.NICID, protocol tcpip.NetworkProto
 
 	// If a NIC is specified, we try to find the address there only.
 	if nicID != 0 {
-		nic := s.nics[nicID]
-		if nic == nil {
+		nic, ok := s.nics[nicID]
+		if !ok {
 			return 0
 		}
 
-		ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint)
-		if ref == nil {
+		addressEndpoint := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint)
+		if addressEndpoint == nil {
 			return 0
 		}
 
-		ref.decRef()
+		addressEndpoint.DecRef()
 
 		return nic.id
 	}
 
 	// Go through all the NICs.
 	for _, nic := range s.nics {
-		ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint)
-		if ref != nil {
-			ref.decRef()
+		if addressEndpoint := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint); addressEndpoint != nil {
+			addressEndpoint.DecRef()
 			return nic.id
 		}
 	}
@@ -1367,8 +1291,8 @@ func (s *Stack) SetPromiscuousMode(nicID tcpip.NICID, enable bool) *tcpip.Error
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
-	nic := s.nics[nicID]
-	if nic == nil {
+	nic, ok := s.nics[nicID]
+	if !ok {
 		return tcpip.ErrUnknownNICID
 	}
 
@@ -1383,8 +1307,8 @@ func (s *Stack) SetSpoofing(nicID tcpip.NICID, enable bool) *tcpip.Error {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
-	nic := s.nics[nicID]
-	if nic == nil {
+	nic, ok := s.nics[nicID]
+	if !ok {
 		return tcpip.ErrUnknownNICID
 	}
 
@@ -1413,11 +1337,36 @@ func (s *Stack) GetLinkAddress(nicID tcpip.NICID, addr, localAddr tcpip.Address,
 
 	fullAddr := tcpip.FullAddress{NIC: nicID, Addr: addr}
 	linkRes := s.linkAddrResolvers[protocol]
-	return s.linkAddrCache.get(fullAddr, linkRes, localAddr, nic.linkEP, waker)
+	return s.linkAddrCache.get(fullAddr, linkRes, localAddr, nic, waker)
+}
+
+// Neighbors returns all IP to MAC address associations.
+func (s *Stack) Neighbors(nicID tcpip.NICID) ([]NeighborEntry, *tcpip.Error) {
+	s.mu.RLock()
+	nic, ok := s.nics[nicID]
+	s.mu.RUnlock()
+
+	if !ok {
+		return nil, tcpip.ErrUnknownNICID
+	}
+
+	return nic.neighbors()
 }
 
-// RemoveWaker implements LinkAddressCache.RemoveWaker.
+// RemoveWaker removes a waker that has been added when link resolution for
+// addr was requested.
 func (s *Stack) RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) {
+	if s.useNeighborCache {
+		s.mu.RLock()
+		nic, ok := s.nics[nicID]
+		s.mu.RUnlock()
+
+		if ok {
+			nic.removeWaker(addr, waker)
+		}
+		return
+	}
+
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
@@ -1427,6 +1376,47 @@ func (s *Stack) RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep.
 	}
 }
 
+// AddStaticNeighbor statically associates an IP address to a MAC address.
+func (s *Stack) AddStaticNeighbor(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) *tcpip.Error {
+	s.mu.RLock()
+	nic, ok := s.nics[nicID]
+	s.mu.RUnlock()
+
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+
+	return nic.addStaticNeighbor(addr, linkAddr)
+}
+
+// RemoveNeighbor removes an IP to MAC address association previously created
+// either automically or by AddStaticNeighbor. Returns ErrBadAddress if there
+// is no association with the provided address.
+func (s *Stack) RemoveNeighbor(nicID tcpip.NICID, addr tcpip.Address) *tcpip.Error {
+	s.mu.RLock()
+	nic, ok := s.nics[nicID]
+	s.mu.RUnlock()
+
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+
+	return nic.removeNeighbor(addr)
+}
+
+// ClearNeighbors removes all IP to MAC address associations.
+func (s *Stack) ClearNeighbors(nicID tcpip.NICID) *tcpip.Error {
+	s.mu.RLock()
+	nic, ok := s.nics[nicID]
+	s.mu.RUnlock()
+
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+
+	return nic.clearNeighbors()
+}
+
 // RegisterTransportEndpoint registers the given endpoint with the stack
 // transport dispatcher. Received packets that match the provided id will be
 // delivered to the given endpoint; specifying a nic is optional, but
@@ -1450,10 +1440,9 @@ func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip
 // StartTransportEndpointCleanup removes the endpoint with the given id from
 // the stack transport dispatcher. It also transitions it to the cleanup stage.
 func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
+	s.cleanupEndpointsMu.Lock()
 	s.cleanupEndpoints[ep] = struct{}{}
+	s.cleanupEndpointsMu.Unlock()
 
 	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice)
 }
@@ -1461,9 +1450,9 @@ func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcp
 // CompleteTransportEndpointCleanup removes the endpoint from the cleanup
 // stage.
 func (s *Stack) CompleteTransportEndpointCleanup(ep TransportEndpoint) {
-	s.mu.Lock()
+	s.cleanupEndpointsMu.Lock()
 	delete(s.cleanupEndpoints, ep)
-	s.mu.Unlock()
+	s.cleanupEndpointsMu.Unlock()
 }
 
 // FindTransportEndpoint finds an endpoint that most closely matches the provided
@@ -1506,23 +1495,23 @@ func (s *Stack) RegisteredEndpoints() []TransportEndpoint {
 
 // CleanupEndpoints returns endpoints currently in the cleanup state.
 func (s *Stack) CleanupEndpoints() []TransportEndpoint {
-	s.mu.Lock()
+	s.cleanupEndpointsMu.Lock()
 	es := make([]TransportEndpoint, 0, len(s.cleanupEndpoints))
 	for e := range s.cleanupEndpoints {
 		es = append(es, e)
 	}
-	s.mu.Unlock()
+	s.cleanupEndpointsMu.Unlock()
 	return es
 }
 
 // RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful
 // for restoring a stack after a save.
 func (s *Stack) RestoreCleanupEndpoints(es []TransportEndpoint) {
-	s.mu.Lock()
+	s.cleanupEndpointsMu.Lock()
 	for _, e := range es {
 		s.cleanupEndpoints[e] = struct{}{}
 	}
-	s.mu.Unlock()
+	s.cleanupEndpointsMu.Unlock()
 }
 
 // Close closes all currently registered transport endpoints.
@@ -1564,7 +1553,7 @@ func (s *Stack) Wait() {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 	for _, n := range s.nics {
-		n.linkEP.Wait()
+		n.LinkEndpoint.Wait()
 	}
 }
 
@@ -1652,7 +1641,7 @@ func (s *Stack) WritePacket(nicID tcpip.NICID, dst tcpip.LinkAddress, netProto t
 
 	// Add our own fake ethernet header.
 	ethFields := header.EthernetFields{
-		SrcAddr: nic.linkEP.LinkAddress(),
+		SrcAddr: nic.LinkEndpoint.LinkAddress(),
 		DstAddr: dst,
 		Type:    netProto,
 	}
@@ -1661,7 +1650,7 @@ func (s *Stack) WritePacket(nicID tcpip.NICID, dst tcpip.LinkAddress, netProto t
 	vv := buffer.View(fakeHeader).ToVectorisedView()
 	vv.Append(payload)
 
-	if err := nic.linkEP.WriteRawPacket(vv); err != nil {
+	if err := nic.LinkEndpoint.WriteRawPacket(vv); err != nil {
 		return err
 	}
 
@@ -1678,7 +1667,7 @@ func (s *Stack) WriteRawPacket(nicID tcpip.NICID, payload buffer.VectorisedView)
 		return tcpip.ErrUnknownDevice
 	}
 
-	if err := nic.linkEP.WriteRawPacket(payload); err != nil {
+	if err := nic.LinkEndpoint.WriteRawPacket(payload); err != nil {
 		return err
 	}
 
@@ -1717,18 +1706,17 @@ func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) Tra
 // guarantee provided on which probe will be invoked. Ideally this should only
 // be called once per stack.
 func (s *Stack) AddTCPProbe(probe TCPProbeFunc) {
-	s.mu.Lock()
-	s.tcpProbeFunc = probe
-	s.mu.Unlock()
+	s.tcpProbeFunc.Store(probe)
 }
 
 // GetTCPProbe returns the TCPProbeFunc if installed with AddTCPProbe, nil
 // otherwise.
 func (s *Stack) GetTCPProbe() TCPProbeFunc {
-	s.mu.Lock()
-	p := s.tcpProbeFunc
-	s.mu.Unlock()
-	return p
+	p := s.tcpProbeFunc.Load()
+	if p == nil {
+		return nil
+	}
+	return p.(TCPProbeFunc)
 }
 
 // RemoveTCPProbe removes an installed TCP probe.
@@ -1737,9 +1725,8 @@ func (s *Stack) GetTCPProbe() TCPProbeFunc {
 // have a probe attached. Endpoints already created will continue to invoke
 // TCP probe.
 func (s *Stack) RemoveTCPProbe() {
-	s.mu.Lock()
-	s.tcpProbeFunc = nil
-	s.mu.Unlock()
+	// This must be TCPProbeFunc(nil) because atomic.Value.Store(nil) panics.
+	s.tcpProbeFunc.Store(TCPProbeFunc(nil))
 }
 
 // JoinGroup joins the given multicast group on the given NIC.
@@ -1760,7 +1747,7 @@ func (s *Stack) LeaveGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NIC
 	defer s.mu.RUnlock()
 
 	if nic, ok := s.nics[nicID]; ok {
-		return nic.leaveGroup(multicastAddr)
+		return nic.leaveGroup(protocol, multicastAddr)
 	}
 	return tcpip.ErrUnknownNICID
 }
@@ -1812,53 +1799,18 @@ func (s *Stack) AllowICMPMessage() bool {
 	return s.icmpRateLimiter.Allow()
 }
 
-// IsAddrTentative returns true if addr is tentative on the NIC with ID id.
-//
-// Note that if addr is not associated with a NIC with id ID, then this
-// function will return false. It will only return true if the address is
-// associated with the NIC AND it is tentative.
-func (s *Stack) IsAddrTentative(id tcpip.NICID, addr tcpip.Address) (bool, *tcpip.Error) {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	nic, ok := s.nics[id]
-	if !ok {
-		return false, tcpip.ErrUnknownNICID
-	}
-
-	return nic.isAddrTentative(addr), nil
-}
-
-// DupTentativeAddrDetected attempts to inform the NIC with ID id that a
-// tentative addr on it is a duplicate on a link.
-func (s *Stack) DupTentativeAddrDetected(id tcpip.NICID, addr tcpip.Address) *tcpip.Error {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	nic, ok := s.nics[id]
-	if !ok {
-		return tcpip.ErrUnknownNICID
-	}
-
-	return nic.dupTentativeAddrDetected(addr)
-}
-
-// SetNDPConfigurations sets the per-interface NDP configurations on the NIC
-// with ID id to c.
-//
-// Note, if c contains invalid NDP configuration values, it will be fixed to
-// use default values for the erroneous values.
-func (s *Stack) SetNDPConfigurations(id tcpip.NICID, c NDPConfigurations) *tcpip.Error {
+// GetNetworkEndpoint returns the NetworkEndpoint with the specified protocol
+// number installed on the specified NIC.
+func (s *Stack) GetNetworkEndpoint(nicID tcpip.NICID, proto tcpip.NetworkProtocolNumber) (NetworkEndpoint, *tcpip.Error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
-	nic, ok := s.nics[id]
+	nic, ok := s.nics[nicID]
 	if !ok {
-		return tcpip.ErrUnknownNICID
+		return nil, tcpip.ErrUnknownNICID
 	}
 
-	nic.setNDPConfigs(c)
-	return nil
+	return nic.getNetworkEndpoint(proto), nil
 }
 
 // NUDConfigurations gets the per-interface NUD configurations.
@@ -1871,7 +1823,7 @@ func (s *Stack) NUDConfigurations(id tcpip.NICID) (NUDConfigurations, *tcpip.Err
 		return NUDConfigurations{}, tcpip.ErrUnknownNICID
 	}
 
-	return nic.NUDConfigs()
+	return nic.nudConfigs()
 }
 
 // SetNUDConfigurations sets the per-interface NUD configurations.
@@ -1890,22 +1842,6 @@ func (s *Stack) SetNUDConfigurations(id tcpip.NICID, c NUDConfigurations) *tcpip
 	return nic.setNUDConfigs(c)
 }
 
-// HandleNDPRA provides a NIC with ID id a validated NDP Router Advertisement
-// message that it needs to handle.
-func (s *Stack) HandleNDPRA(id tcpip.NICID, ip tcpip.Address, ra header.NDPRouterAdvert) *tcpip.Error {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	nic, ok := s.nics[id]
-	if !ok {
-		return tcpip.ErrUnknownNICID
-	}
-
-	nic.handleNDPRA(ip, ra)
-
-	return nil
-}
-
 // Seed returns a 32 bit value that can be used as a seed value for port
 // picking, ISN generation etc.
 //
@@ -1947,21 +1883,17 @@ func (s *Stack) FindNetworkEndpoint(netProto tcpip.NetworkProtocolNumber, addres
 	defer s.mu.RUnlock()
 
 	for _, nic := range s.nics {
-		id := NetworkEndpointID{address}
-
-		if ref, ok := nic.mu.endpoints[id]; ok {
-			nic.mu.RLock()
-			defer nic.mu.RUnlock()
-
-			// An endpoint with this id exists, check if it can be
-			// used and return it.
-			return ref.ep, nil
+		addressEndpoint := nic.getAddressOrCreateTempInner(netProto, address, false /* createTemp */, NeverPrimaryEndpoint)
+		if addressEndpoint == nil {
+			continue
 		}
+		addressEndpoint.DecRef()
+		return nic.getNetworkEndpoint(netProto), nil
 	}
 	return nil, tcpip.ErrBadAddress
 }
 
-// FindNICNameFromID returns the name of the nic for the given NICID.
+// FindNICNameFromID returns the name of the NIC for the given NICID.
 func (s *Stack) FindNICNameFromID(id tcpip.NICID) string {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
@@ -1973,3 +1905,8 @@ func (s *Stack) FindNICNameFromID(id tcpip.NICID) string {
 
 	return nic.Name()
 }
+
+// NewJob returns a new tcpip.Job using the stack's clock.
+func (s *Stack) NewJob(l sync.Locker, f func()) *tcpip.Job {
+	return tcpip.NewJob(s.clock, l, f)
+}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 106645c50..4eed4ced4 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -21,7 +21,6 @@ import (
 	"bytes"
 	"fmt"
 	"math"
-	"net"
 	"sort"
 	"testing"
 	"time"
@@ -29,6 +28,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -68,18 +68,40 @@ const (
 // use the first three: destination address, source address, and transport
 // protocol. They're all one byte fields to simplify parsing.
 type fakeNetworkEndpoint struct {
-	nicID      tcpip.NICID
+	stack.AddressableEndpointState
+
+	mu struct {
+		sync.RWMutex
+
+		enabled bool
+	}
+
+	nic        stack.NetworkInterface
 	proto      *fakeNetworkProtocol
 	dispatcher stack.TransportDispatcher
-	ep         stack.LinkEndpoint
 }
 
-func (f *fakeNetworkEndpoint) MTU() uint32 {
-	return f.ep.MTU() - uint32(f.MaxHeaderLength())
+func (f *fakeNetworkEndpoint) Enable() *tcpip.Error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.mu.enabled = true
+	return nil
+}
+
+func (f *fakeNetworkEndpoint) Enabled() bool {
+	f.mu.RLock()
+	defer f.mu.RUnlock()
+	return f.mu.enabled
+}
+
+func (f *fakeNetworkEndpoint) Disable() {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.mu.enabled = false
 }
 
-func (f *fakeNetworkEndpoint) NICID() tcpip.NICID {
-	return f.nicID
+func (f *fakeNetworkEndpoint) MTU() uint32 {
+	return f.nic.MTU() - uint32(f.MaxHeaderLength())
 }
 
 func (*fakeNetworkEndpoint) DefaultTTL() uint8 {
@@ -111,17 +133,13 @@ func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuff
 }
 
 func (f *fakeNetworkEndpoint) MaxHeaderLength() uint16 {
-	return f.ep.MaxHeaderLength() + fakeNetHeaderLen
+	return f.nic.MaxHeaderLength() + fakeNetHeaderLen
 }
 
 func (f *fakeNetworkEndpoint) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, dstAddr tcpip.Address) uint16 {
 	return 0
 }
 
-func (f *fakeNetworkEndpoint) Capabilities() stack.LinkEndpointCapabilities {
-	return f.ep.Capabilities()
-}
-
 func (f *fakeNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
 	return f.proto.Number()
 }
@@ -144,7 +162,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
 		return nil
 	}
 
-	return f.ep.WritePacket(r, gso, fakeNetNumber, pkt)
+	return f.nic.WritePacket(r, gso, fakeNetNumber, pkt)
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
@@ -156,16 +174,8 @@ func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack
 	return tcpip.ErrNotSupported
 }
 
-func (*fakeNetworkEndpoint) Close() {}
-
-type fakeNetGoodOption bool
-
-type fakeNetBadOption bool
-
-type fakeNetInvalidValueOption int
-
-type fakeNetOptions struct {
-	good bool
+func (f *fakeNetworkEndpoint) Close() {
+	f.AddressableEndpointState.Cleanup()
 }
 
 // fakeNetworkProtocol is a network-layer protocol descriptor. It aggregates the
@@ -174,7 +184,12 @@ type fakeNetOptions struct {
 type fakeNetworkProtocol struct {
 	packetCount     [10]int
 	sendPacketCount [10]int
-	opts            fakeNetOptions
+	defaultTTL      uint8
+
+	mu struct {
+		sync.RWMutex
+		forwarding bool
+	}
 }
 
 func (f *fakeNetworkProtocol) Number() tcpip.NetworkProtocolNumber {
@@ -197,44 +212,43 @@ func (*fakeNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Addres
 	return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1])
 }
 
-func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint, _ *stack.Stack) stack.NetworkEndpoint {
-	return &fakeNetworkEndpoint{
-		nicID:      nicID,
+func (f *fakeNetworkProtocol) NewEndpoint(nic stack.NetworkInterface, _ stack.LinkAddressCache, _ stack.NUDHandler, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint {
+	e := &fakeNetworkEndpoint{
+		nic:        nic,
 		proto:      f,
 		dispatcher: dispatcher,
-		ep:         ep,
 	}
+	e.AddressableEndpointState.Init(e)
+	return e
 }
 
-func (f *fakeNetworkProtocol) SetOption(option interface{}) *tcpip.Error {
+func (f *fakeNetworkProtocol) SetOption(option tcpip.SettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case fakeNetGoodOption:
-		f.opts.good = bool(v)
+	case *tcpip.DefaultTTLOption:
+		f.defaultTTL = uint8(*v)
 		return nil
-	case fakeNetInvalidValueOption:
-		return tcpip.ErrInvalidOptionValue
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
 }
 
-func (f *fakeNetworkProtocol) Option(option interface{}) *tcpip.Error {
+func (f *fakeNetworkProtocol) Option(option tcpip.GettableNetworkProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case *fakeNetGoodOption:
-		*v = fakeNetGoodOption(f.opts.good)
+	case *tcpip.DefaultTTLOption:
+		*v = tcpip.DefaultTTLOption(f.defaultTTL)
 		return nil
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
 }
 
-// Close implements TransportProtocol.Close.
+// Close implements NetworkProtocol.Close.
 func (*fakeNetworkProtocol) Close() {}
 
-// Wait implements TransportProtocol.Wait.
+// Wait implements NetworkProtocol.Wait.
 func (*fakeNetworkProtocol) Wait() {}
 
-// Parse implements TransportProtocol.Parse.
+// Parse implements NetworkProtocol.Parse.
 func (*fakeNetworkProtocol) Parse(pkt *stack.PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) {
 	hdr, ok := pkt.NetworkHeader().Consume(fakeNetHeaderLen)
 	if !ok {
@@ -243,7 +257,21 @@ func (*fakeNetworkProtocol) Parse(pkt *stack.PacketBuffer) (tcpip.TransportProto
 	return tcpip.TransportProtocolNumber(hdr[protocolNumberOffset]), true, true
 }
 
-func fakeNetFactory() stack.NetworkProtocol {
+// Forwarding implements stack.ForwardingNetworkProtocol.
+func (f *fakeNetworkProtocol) Forwarding() bool {
+	f.mu.RLock()
+	defer f.mu.RUnlock()
+	return f.mu.forwarding
+}
+
+// SetForwarding implements stack.ForwardingNetworkProtocol.
+func (f *fakeNetworkProtocol) SetForwarding(v bool) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.mu.forwarding = v
+}
+
+func fakeNetFactory(*stack.Stack) stack.NetworkProtocol {
 	return &fakeNetworkProtocol{}
 }
 
@@ -280,7 +308,7 @@ func TestNetworkReceive(t *testing.T) {
 	// addresses attached to it: 1 & 2.
 	ep := channel.New(10, defaultMTU, "")
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 	if err := s.CreateNIC(1, ep); err != nil {
 		t.Fatal("CreateNIC failed:", err)
@@ -440,7 +468,7 @@ func TestNetworkSend(t *testing.T) {
 	// existing nic.
 	ep := channel.New(10, defaultMTU, "")
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 	if err := s.CreateNIC(1, ep); err != nil {
 		t.Fatal("NewNIC failed:", err)
@@ -467,7 +495,7 @@ func TestNetworkSendMultiRoute(t *testing.T) {
 	// addresses per nic, the first nic has odd address, the second one has
 	// even addresses.
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 
 	ep1 := channel.New(10, defaultMTU, "")
@@ -567,7 +595,7 @@ func TestAttachToLinkEndpointImmediately(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 			})
 
 			e := linkEPWithMockedAttach{
@@ -586,7 +614,7 @@ func TestAttachToLinkEndpointImmediately(t *testing.T) {
 
 func TestDisableUnknownNIC(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 
 	if err := s.DisableNIC(1); err != tcpip.ErrUnknownNICID {
@@ -598,7 +626,7 @@ func TestDisabledNICsNICInfoAndCheckNIC(t *testing.T) {
 	const nicID = 1
 
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 
 	e := loopback.New()
@@ -645,7 +673,7 @@ func TestDisabledNICsNICInfoAndCheckNIC(t *testing.T) {
 
 func TestRemoveUnknownNIC(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 
 	if err := s.RemoveNIC(1); err != tcpip.ErrUnknownNICID {
@@ -657,7 +685,7 @@ func TestRemoveNIC(t *testing.T) {
 	const nicID = 1
 
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 
 	e := linkEPWithMockedAttach{
@@ -718,7 +746,7 @@ func TestRouteWithDownNIC(t *testing.T) {
 
 	setup := func(t *testing.T) (*stack.Stack, *channel.Endpoint, *channel.Endpoint) {
 		s := stack.New(stack.Options{
-			NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+			NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 		})
 
 		ep1 := channel.New(1, defaultMTU, "")
@@ -884,7 +912,7 @@ func TestRoutes(t *testing.T) {
 	// addresses per nic, the first nic has odd address, the second one has
 	// even addresses.
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 
 	ep1 := channel.New(10, defaultMTU, "")
@@ -964,7 +992,7 @@ func TestAddressRemoval(t *testing.T) {
 	remoteAddr := tcpip.Address("\x02")
 
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 
 	ep := channel.New(10, defaultMTU, "")
@@ -1011,7 +1039,7 @@ func TestAddressRemovalWithRouteHeld(t *testing.T) {
 	remoteAddr := tcpip.Address("\x02")
 
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 
 	ep := channel.New(10, defaultMTU, "")
@@ -1102,7 +1130,7 @@ func TestEndpointExpiration(t *testing.T) {
 		for _, spoofing := range []bool{true, false} {
 			t.Run(fmt.Sprintf("promiscuous=%t spoofing=%t", promiscuous, spoofing), func(t *testing.T) {
 				s := stack.New(stack.Options{
-					NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+					NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 				})
 
 				ep := channel.New(10, defaultMTU, "")
@@ -1260,7 +1288,7 @@ func TestEndpointExpiration(t *testing.T) {
 
 func TestPromiscuousMode(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 
 	ep := channel.New(10, defaultMTU, "")
@@ -1312,7 +1340,7 @@ func TestSpoofingWithAddress(t *testing.T) {
 	dstAddr := tcpip.Address("\x03")
 
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 
 	ep := channel.New(10, defaultMTU, "")
@@ -1378,7 +1406,7 @@ func TestSpoofingNoAddress(t *testing.T) {
 	dstAddr := tcpip.Address("\x02")
 
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 
 	ep := channel.New(10, defaultMTU, "")
@@ -1441,7 +1469,7 @@ func verifyRoute(gotRoute, wantRoute stack.Route) error {
 
 func TestOutgoingBroadcastWithEmptyRouteTable(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 
 	ep := channel.New(10, defaultMTU, "")
@@ -1484,7 +1512,7 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) {
 
 	// Create a new stack with two NICs.
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 	ep := channel.New(10, defaultMTU, "")
 	if err := s.CreateNIC(1, ep); err != nil {
@@ -1585,7 +1613,7 @@ func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) {
 	} {
 		t.Run(tc.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 			})
 
 			ep := channel.New(10, defaultMTU, "")
@@ -1640,46 +1668,24 @@ func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) {
 	}
 }
 
-func TestNetworkOptions(t *testing.T) {
+func TestNetworkOption(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
-		TransportProtocols: []stack.TransportProtocol{},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{fakeNetFactory},
+		TransportProtocols: []stack.TransportProtocolFactory{},
 	})
 
-	// Try an unsupported network protocol.
-	if err := s.SetNetworkProtocolOption(tcpip.NetworkProtocolNumber(99999), fakeNetGoodOption(false)); err != tcpip.ErrUnknownProtocol {
-		t.Fatalf("SetNetworkProtocolOption(fakeNet2, blah, false) = %v, want = tcpip.ErrUnknownProtocol", err)
+	opt := tcpip.DefaultTTLOption(5)
+	if err := s.SetNetworkProtocolOption(fakeNetNumber, &opt); err != nil {
+		t.Fatalf("s.SetNetworkProtocolOption(%d, &%T(%d)): %s", fakeNetNumber, opt, opt, err)
 	}
 
-	testCases := []struct {
-		option   interface{}
-		wantErr  *tcpip.Error
-		verifier func(t *testing.T, p stack.NetworkProtocol)
-	}{
-		{fakeNetGoodOption(true), nil, func(t *testing.T, p stack.NetworkProtocol) {
-			t.Helper()
-			fakeNet := p.(*fakeNetworkProtocol)
-			if fakeNet.opts.good != true {
-				t.Fatalf("fakeNet.opts.good = false, want = true")
-			}
-			var v fakeNetGoodOption
-			if err := s.NetworkProtocolOption(fakeNetNumber, &v); err != nil {
-				t.Fatalf("s.NetworkProtocolOption(fakeNetNumber, &v) = %v, want = nil, where v is option %T", v, err)
-			}
-			if v != true {
-				t.Fatalf("s.NetworkProtocolOption(fakeNetNumber, &v) returned v = %v, want = true", v)
-			}
-		}},
-		{fakeNetBadOption(true), tcpip.ErrUnknownProtocolOption, nil},
-		{fakeNetInvalidValueOption(1), tcpip.ErrInvalidOptionValue, nil},
+	var optGot tcpip.DefaultTTLOption
+	if err := s.NetworkProtocolOption(fakeNetNumber, &optGot); err != nil {
+		t.Fatalf("s.NetworkProtocolOption(%d, &%T): %s", fakeNetNumber, optGot, err)
 	}
-	for _, tc := range testCases {
-		if got := s.SetNetworkProtocolOption(fakeNetNumber, tc.option); got != tc.wantErr {
-			t.Errorf("s.SetNetworkProtocolOption(fakeNet, %v) = %v, want = %v", tc.option, got, tc.wantErr)
-		}
-		if tc.verifier != nil {
-			tc.verifier(t, s.NetworkProtocolInstance(fakeNetNumber))
-		}
+
+	if opt != optGot {
+		t.Errorf("got optGot = %d, want = %d", optGot, opt)
 	}
 }
 
@@ -1691,7 +1697,7 @@ func TestGetMainNICAddressAddPrimaryNonPrimary(t *testing.T) {
 					for never := 0; never < 3; never++ {
 						t.Run(fmt.Sprintf("never=%d", never), func(t *testing.T) {
 							s := stack.New(stack.Options{
-								NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+								NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 							})
 							ep := channel.New(10, defaultMTU, "")
 							if err := s.CreateNIC(1, ep); err != nil {
@@ -1758,7 +1764,7 @@ func TestGetMainNICAddressAddPrimaryNonPrimary(t *testing.T) {
 
 func TestGetMainNICAddressAddRemove(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 	ep := channel.New(10, defaultMTU, "")
 	if err := s.CreateNIC(1, ep); err != nil {
@@ -1843,7 +1849,7 @@ func verifyAddresses(t *testing.T, expectedAddresses, gotAddresses []tcpip.Proto
 func TestAddAddress(t *testing.T) {
 	const nicID = 1
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 	ep := channel.New(10, defaultMTU, "")
 	if err := s.CreateNIC(nicID, ep); err != nil {
@@ -1870,7 +1876,7 @@ func TestAddAddress(t *testing.T) {
 func TestAddProtocolAddress(t *testing.T) {
 	const nicID = 1
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 	ep := channel.New(10, defaultMTU, "")
 	if err := s.CreateNIC(nicID, ep); err != nil {
@@ -1904,7 +1910,7 @@ func TestAddProtocolAddress(t *testing.T) {
 func TestAddAddressWithOptions(t *testing.T) {
 	const nicID = 1
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 	ep := channel.New(10, defaultMTU, "")
 	if err := s.CreateNIC(nicID, ep); err != nil {
@@ -1935,7 +1941,7 @@ func TestAddAddressWithOptions(t *testing.T) {
 func TestAddProtocolAddressWithOptions(t *testing.T) {
 	const nicID = 1
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 	ep := channel.New(10, defaultMTU, "")
 	if err := s.CreateNIC(nicID, ep); err != nil {
@@ -2056,7 +2062,7 @@ func TestCreateNICWithOptions(t *testing.T) {
 
 func TestNICStats(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 	})
 	ep1 := channel.New(10, defaultMTU, "")
 	if err := s.CreateNIC(1, ep1); err != nil {
@@ -2097,7 +2103,7 @@ func TestNICStats(t *testing.T) {
 		t.Errorf("got Tx.Packets.Value() = %d, ep1.Drain() = %d", got, want)
 	}
 
-	if got, want := s.NICInfo()[1].Stats.Tx.Bytes.Value(), uint64(len(payload)); got != want {
+	if got, want := s.NICInfo()[1].Stats.Tx.Bytes.Value(), uint64(len(payload)+fakeNetHeaderLen); got != want {
 		t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want)
 	}
 }
@@ -2123,9 +2129,9 @@ func TestNICForwarding(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 			})
-			s.SetForwarding(true)
+			s.SetForwarding(fakeNetNumber, true)
 
 			ep1 := channel.New(10, defaultMTU, "")
 			if err := s.CreateNIC(nicID1, ep1); err != nil {
@@ -2247,7 +2253,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 		nicName      string
 		autoGen      bool
 		linkAddr     tcpip.LinkAddress
-		iidOpts      stack.OpaqueInterfaceIdentifierOptions
+		iidOpts      ipv6.OpaqueInterfaceIdentifierOptions
 		shouldGen    bool
 		expectedAddr tcpip.Address
 	}{
@@ -2263,7 +2269,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 			nicName:  "nic1",
 			autoGen:  false,
 			linkAddr: linkAddr1,
-			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+			iidOpts: ipv6.OpaqueInterfaceIdentifierOptions{
 				NICNameFromID: nicNameFunc,
 				SecretKey:     secretKey[:],
 			},
@@ -2308,7 +2314,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 			nicName:  "nic1",
 			autoGen:  true,
 			linkAddr: linkAddr1,
-			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+			iidOpts: ipv6.OpaqueInterfaceIdentifierOptions{
 				NICNameFromID: nicNameFunc,
 				SecretKey:     secretKey[:],
 			},
@@ -2320,7 +2326,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 		{
 			name:    "OIID Empty MAC and empty nicName",
 			autoGen: true,
-			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+			iidOpts: ipv6.OpaqueInterfaceIdentifierOptions{
 				NICNameFromID: nicNameFunc,
 				SecretKey:     secretKey[:1],
 			},
@@ -2332,7 +2338,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 			nicName:  "test",
 			autoGen:  true,
 			linkAddr: "\x01\x02\x03",
-			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+			iidOpts: ipv6.OpaqueInterfaceIdentifierOptions{
 				NICNameFromID: nicNameFunc,
 				SecretKey:     secretKey[:2],
 			},
@@ -2344,7 +2350,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 			nicName:  "test2",
 			autoGen:  true,
 			linkAddr: "\x01\x02\x03\x04\x05\x06",
-			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+			iidOpts: ipv6.OpaqueInterfaceIdentifierOptions{
 				NICNameFromID: nicNameFunc,
 				SecretKey:     secretKey[:3],
 			},
@@ -2356,7 +2362,7 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 			nicName:  "test3",
 			autoGen:  true,
 			linkAddr: "\x00\x00\x00\x00\x00\x00",
-			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+			iidOpts: ipv6.OpaqueInterfaceIdentifierOptions{
 				NICNameFromID: nicNameFunc,
 			},
 			shouldGen:    true,
@@ -2370,10 +2376,11 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
 			}
 			opts := stack.Options{
-				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
-				AutoGenIPv6LinkLocal: test.autoGen,
-				NDPDisp:              &ndpDisp,
-				OpaqueIIDOpts:        test.iidOpts,
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					AutoGenIPv6LinkLocal: test.autoGen,
+					NDPDisp:              &ndpDisp,
+					OpaqueIIDOpts:        test.iidOpts,
+				})},
 			}
 
 			e := channel.New(0, 1280, test.linkAddr)
@@ -2445,15 +2452,15 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) {
 
 	tests := []struct {
 		name          string
-		opaqueIIDOpts stack.OpaqueInterfaceIdentifierOptions
+		opaqueIIDOpts ipv6.OpaqueInterfaceIdentifierOptions
 	}{
 		{
 			name:          "IID From MAC",
-			opaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{},
+			opaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{},
 		},
 		{
 			name: "Opaque IID",
-			opaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+			opaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{
 				NICNameFromID: func(_ tcpip.NICID, nicName string) string {
 					return nicName
 				},
@@ -2464,9 +2471,10 @@ func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			opts := stack.Options{
-				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
-				AutoGenIPv6LinkLocal: true,
-				OpaqueIIDOpts:        test.opaqueIIDOpts,
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					AutoGenIPv6LinkLocal: true,
+					OpaqueIIDOpts:        test.opaqueIIDOpts,
+				})},
 			}
 
 			e := loopback.New()
@@ -2495,12 +2503,13 @@ func TestNICAutoGenAddrDoesDAD(t *testing.T) {
 	ndpDisp := ndpDispatcher{
 		dadC: make(chan ndpDADEvent),
 	}
-	ndpConfigs := stack.DefaultNDPConfigurations()
+	ndpConfigs := ipv6.DefaultNDPConfigurations()
 	opts := stack.Options{
-		NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs:           ndpConfigs,
-		AutoGenIPv6LinkLocal: true,
-		NDPDisp:              &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs:           ndpConfigs,
+			AutoGenIPv6LinkLocal: true,
+			NDPDisp:              &ndpDisp,
+		})},
 	}
 
 	e := channel.New(int(ndpConfigs.DupAddrDetectTransmits), 1280, linkAddr1)
@@ -2556,7 +2565,7 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 		for _, ps := range pebs {
 			t.Run(fmt.Sprintf("%d-to-%d", pi, ps), func(t *testing.T) {
 				s := stack.New(stack.Options{
-					NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+					NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
 				})
 				ep1 := channel.New(10, defaultMTU, "")
 				if err := s.CreateNIC(1, ep1); err != nil {
@@ -2847,14 +2856,15 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 		t.Run(test.name, func(t *testing.T) {
 			e := channel.New(0, 1280, linkAddr1)
 			s := stack.New(stack.Options{
-				NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
-				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
-				NDPConfigs: stack.NDPConfigurations{
-					HandleRAs:                  true,
-					AutoGenGlobalAddresses:     true,
-					AutoGenTempGlobalAddresses: true,
-				},
-				NDPDisp: &ndpDispatcher{},
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+					NDPConfigs: ipv6.NDPConfigurations{
+						HandleRAs:                  true,
+						AutoGenGlobalAddresses:     true,
+						AutoGenTempGlobalAddresses: true,
+					},
+					NDPDisp: &ndpDispatcher{},
+				})},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -2903,7 +2913,7 @@ func TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable(t *testing.T) {
 
 	e := loopback.New()
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol},
 	})
 	nicOpts := stack.NICOptions{Disabled: true}
 	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
@@ -2955,7 +2965,7 @@ func TestLeaveIPv6SolicitedNodeAddrBeforeAddrRemoval(t *testing.T) {
 	const nicID = 1
 
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocol},
 	})
 	e := channel.New(10, 1280, linkAddr1)
 	if err := s.CreateNIC(1, e); err != nil {
@@ -3016,7 +3026,7 @@ func TestJoinLeaveMulticastOnNICEnableDisable(t *testing.T) {
 		t.Run(test.name, func(t *testing.T) {
 			e := loopback.New()
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
 			})
 			nicOpts := stack.NICOptions{Disabled: true}
 			if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
@@ -3093,12 +3103,13 @@ func TestDoDADWhenNICEnabled(t *testing.T) {
 		dadC: make(chan ndpDADEvent),
 	}
 	opts := stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			DupAddrDetectTransmits: dadTransmits,
-			RetransmitTimer:        retransmitTimer,
-		},
-		NDPDisp: &ndpDisp,
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPConfigs: ipv6.NDPConfigurations{
+				DupAddrDetectTransmits: dadTransmits,
+				RetransmitTimer:        retransmitTimer,
+			},
+			NDPDisp: &ndpDisp,
+		})},
 	}
 
 	e := channel.New(dadTransmits, 1280, linkAddr1)
@@ -3457,7 +3468,7 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+				NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
 			})
 			ep := channel.New(0, defaultMTU, "")
 			if err := s.CreateNIC(nicID1, ep); err != nil {
@@ -3495,7 +3506,7 @@ func TestResolveWith(t *testing.T) {
 	)
 
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), arp.NewProtocol()},
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, arp.NewProtocol},
 	})
 	ep := channel.New(0, defaultMTU, "")
 	ep.LinkEPCapabilities |= stack.CapabilityResolutionRequired
@@ -3505,17 +3516,17 @@ func TestResolveWith(t *testing.T) {
 	addr := tcpip.ProtocolAddress{
 		Protocol: header.IPv4ProtocolNumber,
 		AddressWithPrefix: tcpip.AddressWithPrefix{
-			Address:   tcpip.Address(net.ParseIP("192.168.1.58").To4()),
+			Address:   tcpip.Address([]byte{192, 168, 1, 58}),
 			PrefixLen: 24,
 		},
 	}
 	if err := s.AddProtocolAddress(nicID, addr); err != nil {
-		t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, addr, err)
+		t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, addr, err)
 	}
 
 	s.SetRouteTable([]tcpip.Route{{Destination: header.IPv4EmptySubnet, NIC: nicID}})
 
-	remoteAddr := tcpip.Address(net.ParseIP("192.168.1.59").To4())
+	remoteAddr := tcpip.Address([]byte{192, 168, 1, 59})
 	r, err := s.FindRoute(unspecifiedNICID, "" /* localAddr */, remoteAddr, header.IPv4ProtocolNumber, false /* multicastLoop */)
 	if err != nil {
 		t.Fatalf("FindRoute(%d, '', %s, %d): %s", unspecifiedNICID, remoteAddr, header.IPv4ProtocolNumber, err)
@@ -3533,3 +3544,215 @@ func TestResolveWith(t *testing.T) {
 		t.Fatal("got r.IsResolutionRequired() = true, want = false")
 	}
 }
+
+// TestRouteReleaseAfterAddrRemoval tests that releasing a Route after its
+// associated address is removed should not cause a panic.
+func TestRouteReleaseAfterAddrRemoval(t *testing.T) {
+	const (
+		nicID      = 1
+		localAddr  = tcpip.Address("\x01")
+		remoteAddr = tcpip.Address("\x02")
+	)
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
+	})
+
+	ep := channel.New(0, defaultMTU, "")
+	if err := s.CreateNIC(nicID, ep); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+	}
+	if err := s.AddAddress(nicID, fakeNetNumber, localAddr); err != nil {
+		t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, fakeNetNumber, localAddr, err)
+	}
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+	}
+
+	r, err := s.FindRoute(nicID, localAddr, remoteAddr, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatalf("s.FindRoute(%d, %s, %s, %d, false): %s", nicID, localAddr, remoteAddr, fakeNetNumber, err)
+	}
+	// Should not panic.
+	defer r.Release()
+
+	// Check that removing the same address fails.
+	if err := s.RemoveAddress(nicID, localAddr); err != nil {
+		t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, localAddr, err)
+	}
+}
+
+func TestGetNetworkEndpoint(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name         string
+		protoFactory stack.NetworkProtocolFactory
+		protoNum     tcpip.NetworkProtocolNumber
+	}{
+		{
+			name:         "IPv4",
+			protoFactory: ipv4.NewProtocol,
+			protoNum:     ipv4.ProtocolNumber,
+		},
+		{
+			name:         "IPv6",
+			protoFactory: ipv6.NewProtocol,
+			protoNum:     ipv6.ProtocolNumber,
+		},
+	}
+
+	factories := make([]stack.NetworkProtocolFactory, 0, len(tests))
+	for _, test := range tests {
+		factories = append(factories, test.protoFactory)
+	}
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: factories,
+	})
+
+	if err := s.CreateNIC(nicID, channel.New(0, defaultMTU, "")); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ep, err := s.GetNetworkEndpoint(nicID, test.protoNum)
+			if err != nil {
+				t.Fatalf("s.GetNetworkEndpoint(%d, %d): %s", nicID, test.protoNum, err)
+			}
+
+			if got := ep.NetworkProtocolNumber(); got != test.protoNum {
+				t.Fatalf("got ep.NetworkProtocolNumber() = %d, want = %d", got, test.protoNum)
+			}
+		})
+	}
+}
+
+func TestGetMainNICAddressWhenNICDisabled(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocolFactory{fakeNetFactory},
+	})
+
+	if err := s.CreateNIC(nicID, channel.New(0, defaultMTU, "")); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+	}
+
+	protocolAddress := tcpip.ProtocolAddress{
+		Protocol: fakeNetNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   "\x01",
+			PrefixLen: 8,
+		},
+	}
+	if err := s.AddProtocolAddress(nicID, protocolAddress); err != nil {
+		t.Fatalf("AddProtocolAddress(%d, %#v): %s", nicID, protocolAddress, err)
+	}
+
+	// Check that we get the right initial address and prefix length.
+	if gotAddr, err := s.GetMainNICAddress(nicID, fakeNetNumber); err != nil {
+		t.Fatalf("GetMainNICAddress(%d, %d): %s", nicID, fakeNetNumber, err)
+	} else if gotAddr != protocolAddress.AddressWithPrefix {
+		t.Fatalf("got GetMainNICAddress(%d, %d) = %s, want = %s", nicID, fakeNetNumber, gotAddr, protocolAddress.AddressWithPrefix)
+	}
+
+	// Should still get the address when the NIC is diabled.
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("DisableNIC(%d): %s", nicID, err)
+	}
+	if gotAddr, err := s.GetMainNICAddress(nicID, fakeNetNumber); err != nil {
+		t.Fatalf("GetMainNICAddress(%d, %d): %s", nicID, fakeNetNumber, err)
+	} else if gotAddr != protocolAddress.AddressWithPrefix {
+		t.Fatalf("got GetMainNICAddress(%d, %d) = %s, want = %s", nicID, fakeNetNumber, gotAddr, protocolAddress.AddressWithPrefix)
+	}
+}
+
+// TestAddRoute tests Stack.AddRoute
+func TestAddRoute(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{})
+
+	subnet1, err := tcpip.NewSubnet("\x00", "\x00")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	subnet2, err := tcpip.NewSubnet("\x01", "\x01")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	expected := []tcpip.Route{
+		{Destination: subnet1, Gateway: "\x00", NIC: 1},
+		{Destination: subnet2, Gateway: "\x00", NIC: 1},
+	}
+
+	// Initialize the route table with one route.
+	s.SetRouteTable([]tcpip.Route{expected[0]})
+
+	// Add another route.
+	s.AddRoute(expected[1])
+
+	rt := s.GetRouteTable()
+	if got, want := len(rt), len(expected); got != want {
+		t.Fatalf("Unexpected route table length got = %d, want = %d", got, want)
+	}
+	for i, route := range rt {
+		if got, want := route, expected[i]; got != want {
+			t.Fatalf("Unexpected route got = %#v, want = %#v", got, want)
+		}
+	}
+}
+
+// TestRemoveRoutes tests Stack.RemoveRoutes
+func TestRemoveRoutes(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{})
+
+	addressToRemove := tcpip.Address("\x01")
+	subnet1, err := tcpip.NewSubnet(addressToRemove, "\x01")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	subnet2, err := tcpip.NewSubnet(addressToRemove, "\x01")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	subnet3, err := tcpip.NewSubnet("\x02", "\x02")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Initialize the route table with three routes.
+	s.SetRouteTable([]tcpip.Route{
+		{Destination: subnet1, Gateway: "\x00", NIC: 1},
+		{Destination: subnet2, Gateway: "\x00", NIC: 1},
+		{Destination: subnet3, Gateway: "\x00", NIC: 1},
+	})
+
+	// Remove routes with the specific address.
+	s.RemoveRoutes(func(r tcpip.Route) bool {
+		return r.Destination.ID() == addressToRemove
+	})
+
+	expected := []tcpip.Route{{Destination: subnet3, Gateway: "\x00", NIC: 1}}
+	rt := s.GetRouteTable()
+	if got, want := len(rt), len(expected); got != want {
+		t.Fatalf("Unexpected route table length got = %d, want = %d", got, want)
+	}
+	for i, route := range rt {
+		if got, want := route, expected[i]; got != want {
+			t.Fatalf("Unexpected route got = %#v, want = %#v", got, want)
+		}
+	}
+}
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index b902c6ca9..35e5b1a2e 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -155,7 +155,7 @@ func (epsByNIC *endpointsByNIC) transportEndpoints() []TransportEndpoint {
 func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) {
 	epsByNIC.mu.RLock()
 
-	mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()]
+	mpep, ok := epsByNIC.endpoints[r.nic.ID()]
 	if !ok {
 		if mpep, ok = epsByNIC.endpoints[0]; !ok {
 			epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
@@ -165,7 +165,7 @@ func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, p
 
 	// If this is a broadcast or multicast datagram, deliver the datagram to all
 	// endpoints bound to the right device.
-	if isMulticastOrBroadcast(id.LocalAddress) {
+	if isInboundMulticastOrBroadcast(r) {
 		mpep.handlePacketAll(r, id, pkt)
 		epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
 		return
@@ -526,7 +526,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 
 	// If the packet is a UDP broadcast or multicast, then find all matching
 	// transport endpoints.
-	if protocol == header.UDPProtocolNumber && isMulticastOrBroadcast(id.LocalAddress) {
+	if protocol == header.UDPProtocolNumber && isInboundMulticastOrBroadcast(r) {
 		eps.mu.RLock()
 		destEPs := eps.findAllEndpointsLocked(id)
 		eps.mu.RUnlock()
@@ -544,9 +544,11 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 		return true
 	}
 
-	// If the packet is a TCP packet with a non-unicast source or destination
-	// address, then do nothing further and instruct the caller to do the same.
-	if protocol == header.TCPProtocolNumber && (!isUnicast(r.LocalAddress) || !isUnicast(r.RemoteAddress)) {
+	// If the packet is a TCP packet with a unspecified source or non-unicast
+	// destination address, then do nothing further and instruct the caller to do
+	// the same. The network layer handles address validation for specified source
+	// addresses.
+	if protocol == header.TCPProtocolNumber && (!isSpecified(r.LocalAddress) || !isSpecified(r.RemoteAddress) || isInboundMulticastOrBroadcast(r)) {
 		// TCP can only be used to communicate between a single source and a
 		// single destination; the addresses must be unicast.
 		r.Stats().TCP.InvalidSegmentsReceived.Increment()
@@ -626,7 +628,7 @@ func (d *transportDemuxer) findTransportEndpoint(netProto tcpip.NetworkProtocolN
 	epsByNIC.mu.RLock()
 	eps.mu.RUnlock()
 
-	mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()]
+	mpep, ok := epsByNIC.endpoints[r.nic.ID()]
 	if !ok {
 		if mpep, ok = epsByNIC.endpoints[0]; !ok {
 			epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
@@ -677,10 +679,10 @@ func (d *transportDemuxer) unregisterRawEndpoint(netProto tcpip.NetworkProtocolN
 	eps.mu.Unlock()
 }
 
-func isMulticastOrBroadcast(addr tcpip.Address) bool {
-	return addr == header.IPv4Broadcast || header.IsV4MulticastAddress(addr) || header.IsV6MulticastAddress(addr)
+func isInboundMulticastOrBroadcast(r *Route) bool {
+	return r.IsInboundBroadcast() || header.IsV4MulticastAddress(r.LocalAddress) || header.IsV6MulticastAddress(r.LocalAddress)
 }
 
-func isUnicast(addr tcpip.Address) bool {
-	return addr != header.IPv4Any && addr != header.IPv6Any && !isMulticastOrBroadcast(addr)
+func isSpecified(addr tcpip.Address) bool {
+	return addr != header.IPv4Any && addr != header.IPv6Any
 }
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 1339edc2d..698c8609e 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -51,8 +51,8 @@ type testContext struct {
 // newDualTestContextMultiNIC creates the testing context and also linkEpIDs NICs.
 func newDualTestContextMultiNIC(t *testing.T, mtu uint32, linkEpIDs []tcpip.NICID) *testContext {
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
 	})
 	linkEps := make(map[tcpip.NICID]*channel.Endpoint)
 	for _, linkEpID := range linkEpIDs {
@@ -182,8 +182,8 @@ func TestTransportDemuxerRegister(t *testing.T) {
 	} {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
-				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
 			})
 			var wq waiter.Queue
 			ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
@@ -312,8 +312,8 @@ func TestBindToDeviceDistribution(t *testing.T) {
 							t.Fatalf("SetSockOptBool(ReusePortOption, %t) on endpoint %d failed: %s", endpoint.reuse, i, err)
 						}
 						bindToDeviceOption := tcpip.BindToDeviceOption(endpoint.bindToDevice)
-						if err := ep.SetSockOpt(bindToDeviceOption); err != nil {
-							t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %s", bindToDeviceOption, i, err)
+						if err := ep.SetSockOpt(&bindToDeviceOption); err != nil {
+							t.Fatalf("SetSockOpt(&%T(%d)) on endpoint %d failed: %s", bindToDeviceOption, bindToDeviceOption, i, err)
 						}
 
 						var dstAddr tcpip.Address
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 6c6e44468..6b8071467 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -28,7 +28,7 @@ import (
 
 const (
 	fakeTransNumber    tcpip.TransportProtocolNumber = 1
-	fakeTransHeaderLen                               = 3
+	fakeTransHeaderLen int                           = 3
 )
 
 // fakeTransportEndpoint is a transport-layer protocol endpoint. It counts
@@ -39,7 +39,7 @@ const (
 // use it.
 type fakeTransportEndpoint struct {
 	stack.TransportEndpointInfo
-	stack    *stack.Stack
+
 	proto    *fakeTransportProtocol
 	peerAddr tcpip.Address
 	route    stack.Route
@@ -53,14 +53,14 @@ func (f *fakeTransportEndpoint) Info() tcpip.EndpointInfo {
 	return &f.TransportEndpointInfo
 }
 
-func (f *fakeTransportEndpoint) Stats() tcpip.EndpointStats {
+func (*fakeTransportEndpoint) Stats() tcpip.EndpointStats {
 	return nil
 }
 
-func (f *fakeTransportEndpoint) SetOwner(owner tcpip.PacketOwner) {}
+func (*fakeTransportEndpoint) SetOwner(owner tcpip.PacketOwner) {}
 
-func newFakeTransportEndpoint(s *stack.Stack, proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint {
-	return &fakeTransportEndpoint{stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID}
+func newFakeTransportEndpoint(proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint {
+	return &fakeTransportEndpoint{TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID}
 }
 
 func (f *fakeTransportEndpoint) Abort() {
@@ -100,12 +100,12 @@ func (f *fakeTransportEndpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions
 	return int64(len(v)), nil, nil
 }
 
-func (f *fakeTransportEndpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
+func (*fakeTransportEndpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 	return 0, tcpip.ControlMessages{}, nil
 }
 
 // SetSockOpt sets a socket option. Currently not supported.
-func (*fakeTransportEndpoint) SetSockOpt(interface{}) *tcpip.Error {
+func (*fakeTransportEndpoint) SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error {
 	return tcpip.ErrInvalidEndpointState
 }
 
@@ -130,11 +130,7 @@ func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.E
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (*fakeTransportEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.ErrorOption:
-		return nil
-	}
+func (*fakeTransportEndpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error {
 	return tcpip.ErrInvalidEndpointState
 }
 
@@ -147,7 +143,7 @@ func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	f.peerAddr = addr.Addr
 
 	// Find the route.
-	r, err := f.stack.FindRoute(addr.NIC, "", addr.Addr, fakeNetNumber, false /* multicastLoop */)
+	r, err := f.proto.stack.FindRoute(addr.NIC, "", addr.Addr, fakeNetNumber, false /* multicastLoop */)
 	if err != nil {
 		return tcpip.ErrNoRoute
 	}
@@ -155,7 +151,7 @@ func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 
 	// Try to register so that we can start receiving packets.
 	f.ID.RemoteAddress = addr.Addr
-	err = f.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.ID, f, ports.Flags{}, 0 /* bindToDevice */)
+	err = f.proto.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.ID, f, ports.Flags{}, 0 /* bindToDevice */)
 	if err != nil {
 		return err
 	}
@@ -169,7 +165,7 @@ func (f *fakeTransportEndpoint) UniqueID() uint64 {
 	return f.uniqueID
 }
 
-func (f *fakeTransportEndpoint) ConnectEndpoint(e tcpip.Endpoint) *tcpip.Error {
+func (*fakeTransportEndpoint) ConnectEndpoint(e tcpip.Endpoint) *tcpip.Error {
 	return nil
 }
 
@@ -184,7 +180,7 @@ func (*fakeTransportEndpoint) Listen(int) *tcpip.Error {
 	return nil
 }
 
-func (f *fakeTransportEndpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+func (f *fakeTransportEndpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	if len(f.acceptQueue) == 0 {
 		return nil, nil, nil
 	}
@@ -194,7 +190,7 @@ func (f *fakeTransportEndpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.
 }
 
 func (f *fakeTransportEndpoint) Bind(a tcpip.FullAddress) *tcpip.Error {
-	if err := f.stack.RegisterTransportEndpoint(
+	if err := f.proto.stack.RegisterTransportEndpoint(
 		a.NIC,
 		[]tcpip.NetworkProtocolNumber{fakeNetNumber},
 		fakeTransNumber,
@@ -222,7 +218,6 @@ func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportE
 	f.proto.packetCount++
 	if f.acceptQueue != nil {
 		f.acceptQueue = append(f.acceptQueue, fakeTransportEndpoint{
-			stack: f.stack,
 			TransportEndpointInfo: stack.TransportEndpointInfo{
 				ID:       f.ID,
 				NetProto: f.NetProto,
@@ -239,19 +234,19 @@ func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, s
 	f.proto.controlCount++
 }
 
-func (f *fakeTransportEndpoint) State() uint32 {
+func (*fakeTransportEndpoint) State() uint32 {
 	return 0
 }
 
-func (f *fakeTransportEndpoint) ModerateRecvBuf(copied int) {}
+func (*fakeTransportEndpoint) ModerateRecvBuf(copied int) {}
 
-func (f *fakeTransportEndpoint) IPTables() (stack.IPTables, error) {
-	return stack.IPTables{}, nil
-}
+func (*fakeTransportEndpoint) Resume(*stack.Stack) {}
 
-func (f *fakeTransportEndpoint) Resume(*stack.Stack) {}
+func (*fakeTransportEndpoint) Wait() {}
 
-func (f *fakeTransportEndpoint) Wait() {}
+func (*fakeTransportEndpoint) LastError() *tcpip.Error {
+	return nil
+}
 
 type fakeTransportGoodOption bool
 
@@ -266,6 +261,8 @@ type fakeTransportProtocolOptions struct {
 // fakeTransportProtocol is a transport-layer protocol descriptor. It
 // aggregates the number of packets received via endpoints of this protocol.
 type fakeTransportProtocol struct {
+	stack *stack.Stack
+
 	packetCount  int
 	controlCount int
 	opts         fakeTransportProtocolOptions
@@ -275,11 +272,11 @@ func (*fakeTransportProtocol) Number() tcpip.TransportProtocolNumber {
 	return fakeTransNumber
 }
 
-func (f *fakeTransportProtocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
-	return newFakeTransportEndpoint(stack, f, netProto, stack.UniqueID()), nil
+func (f *fakeTransportProtocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return newFakeTransportEndpoint(f, netProto, f.stack.UniqueID()), nil
 }
 
-func (*fakeTransportProtocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+func (*fakeTransportProtocol) NewRawEndpoint(tcpip.NetworkProtocolNumber, *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	return nil, tcpip.ErrUnknownProtocol
 }
 
@@ -291,26 +288,24 @@ func (*fakeTransportProtocol) ParsePorts(buffer.View) (src, dst uint16, err *tcp
 	return 0, 0, nil
 }
 
-func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) bool {
-	return true
+func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition {
+	return stack.UnknownDestinationPacketHandled
 }
 
-func (f *fakeTransportProtocol) SetOption(option interface{}) *tcpip.Error {
+func (f *fakeTransportProtocol) SetOption(option tcpip.SettableTransportProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case fakeTransportGoodOption:
-		f.opts.good = bool(v)
+	case *tcpip.TCPModerateReceiveBufferOption:
+		f.opts.good = bool(*v)
 		return nil
-	case fakeTransportInvalidValueOption:
-		return tcpip.ErrInvalidOptionValue
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
 }
 
-func (f *fakeTransportProtocol) Option(option interface{}) *tcpip.Error {
+func (f *fakeTransportProtocol) Option(option tcpip.GettableTransportProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case *fakeTransportGoodOption:
-		*v = fakeTransportGoodOption(f.opts.good)
+	case *tcpip.TCPModerateReceiveBufferOption:
+		*v = tcpip.TCPModerateReceiveBufferOption(f.opts.good)
 		return nil
 	default:
 		return tcpip.ErrUnknownProtocolOption
@@ -332,15 +327,15 @@ func (*fakeTransportProtocol) Parse(pkt *stack.PacketBuffer) bool {
 	return ok
 }
 
-func fakeTransFactory() stack.TransportProtocol {
-	return &fakeTransportProtocol{}
+func fakeTransFactory(s *stack.Stack) stack.TransportProtocol {
+	return &fakeTransportProtocol{stack: s}
 }
 
 func TestTransportReceive(t *testing.T) {
 	linkEP := channel.New(10, defaultMTU, "")
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
-		TransportProtocols: []stack.TransportProtocol{fakeTransFactory()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{fakeNetFactory},
+		TransportProtocols: []stack.TransportProtocolFactory{fakeTransFactory},
 	})
 	if err := s.CreateNIC(1, linkEP); err != nil {
 		t.Fatalf("CreateNIC failed: %v", err)
@@ -410,8 +405,8 @@ func TestTransportReceive(t *testing.T) {
 func TestTransportControlReceive(t *testing.T) {
 	linkEP := channel.New(10, defaultMTU, "")
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
-		TransportProtocols: []stack.TransportProtocol{fakeTransFactory()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{fakeNetFactory},
+		TransportProtocols: []stack.TransportProtocolFactory{fakeTransFactory},
 	})
 	if err := s.CreateNIC(1, linkEP); err != nil {
 		t.Fatalf("CreateNIC failed: %v", err)
@@ -487,8 +482,8 @@ func TestTransportControlReceive(t *testing.T) {
 func TestTransportSend(t *testing.T) {
 	linkEP := channel.New(10, defaultMTU, "")
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
-		TransportProtocols: []stack.TransportProtocol{fakeTransFactory()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{fakeNetFactory},
+		TransportProtocols: []stack.TransportProtocolFactory{fakeTransFactory},
 	})
 	if err := s.CreateNIC(1, linkEP); err != nil {
 		t.Fatalf("CreateNIC failed: %v", err)
@@ -533,54 +528,29 @@ func TestTransportSend(t *testing.T) {
 
 func TestTransportOptions(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
-		TransportProtocols: []stack.TransportProtocol{fakeTransFactory()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{fakeNetFactory},
+		TransportProtocols: []stack.TransportProtocolFactory{fakeTransFactory},
 	})
 
-	// Try an unsupported transport protocol.
-	if err := s.SetTransportProtocolOption(tcpip.TransportProtocolNumber(99999), fakeTransportGoodOption(false)); err != tcpip.ErrUnknownProtocol {
-		t.Fatalf("SetTransportProtocolOption(fakeTrans2, blah, false) = %v, want = tcpip.ErrUnknownProtocol", err)
-	}
-
-	testCases := []struct {
-		option   interface{}
-		wantErr  *tcpip.Error
-		verifier func(t *testing.T, p stack.TransportProtocol)
-	}{
-		{fakeTransportGoodOption(true), nil, func(t *testing.T, p stack.TransportProtocol) {
-			t.Helper()
-			fakeTrans := p.(*fakeTransportProtocol)
-			if fakeTrans.opts.good != true {
-				t.Fatalf("fakeTrans.opts.good = false, want = true")
-			}
-			var v fakeTransportGoodOption
-			if err := s.TransportProtocolOption(fakeTransNumber, &v); err != nil {
-				t.Fatalf("s.TransportProtocolOption(fakeTransNumber, &v) = %v, want = nil, where v is option %T", v, err)
-			}
-			if v != true {
-				t.Fatalf("s.TransportProtocolOption(fakeTransNumber, &v) returned v = %v, want = true", v)
-			}
-
-		}},
-		{fakeTransportBadOption(true), tcpip.ErrUnknownProtocolOption, nil},
-		{fakeTransportInvalidValueOption(1), tcpip.ErrInvalidOptionValue, nil},
-	}
-	for _, tc := range testCases {
-		if got := s.SetTransportProtocolOption(fakeTransNumber, tc.option); got != tc.wantErr {
-			t.Errorf("s.SetTransportProtocolOption(fakeTrans, %v) = %v, want = %v", tc.option, got, tc.wantErr)
-		}
-		if tc.verifier != nil {
-			tc.verifier(t, s.TransportProtocolInstance(fakeTransNumber))
-		}
+	v := tcpip.TCPModerateReceiveBufferOption(true)
+	if err := s.SetTransportProtocolOption(fakeTransNumber, &v); err != nil {
+		t.Errorf("s.SetTransportProtocolOption(fakeTrans, &%T(%t)): %s", v, v, err)
+	}
+	v = false
+	if err := s.TransportProtocolOption(fakeTransNumber, &v); err != nil {
+		t.Fatalf("s.TransportProtocolOption(fakeTransNumber, &%T): %s", v, err)
+	}
+	if !v {
+		t.Fatalf("got tcpip.TCPModerateReceiveBufferOption = false, want = true")
 	}
 }
 
 func TestTransportForwarding(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
-		TransportProtocols: []stack.TransportProtocol{fakeTransFactory()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{fakeNetFactory},
+		TransportProtocols: []stack.TransportProtocolFactory{fakeTransFactory},
 	})
-	s.SetForwarding(true)
+	s.SetForwarding(fakeNetNumber, true)
 
 	// TODO(b/123449044): Change this to a channel NIC.
 	ep1 := loopback.New()
@@ -635,7 +605,7 @@ func TestTransportForwarding(t *testing.T) {
 		Data: req.ToVectorisedView(),
 	}))
 
-	aep, _, err := ep.Accept()
+	aep, _, err := ep.Accept(nil)
 	if err != nil || aep == nil {
 		t.Fatalf("Accept failed: %v, %v", aep, err)
 	}
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 07c85ce59..ac4d39d3e 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -111,6 +111,7 @@ var (
 	ErrBroadcastDisabled         = &Error{msg: "broadcast socket option disabled"}
 	ErrNotPermitted              = &Error{msg: "operation not permitted"}
 	ErrAddressFamilyNotSupported = &Error{msg: "address family not supported by protocol"}
+	ErrMalformedHeader           = &Error{msg: "header is malformed"}
 )
 
 var messageToError map[string]*Error
@@ -159,6 +160,7 @@ func StringToError(s string) *Error {
 			ErrBroadcastDisabled,
 			ErrNotPermitted,
 			ErrAddressFamilyNotSupported,
+			ErrMalformedHeader,
 		}
 
 		messageToError = make(map[string]*Error)
@@ -237,6 +239,14 @@ type Timer interface {
 // network node. Or, in the case of unix endpoints, it may represent a path.
 type Address string
 
+// WithPrefix returns the address with a prefix that represents a point subnet.
+func (a Address) WithPrefix() AddressWithPrefix {
+	return AddressWithPrefix{
+		Address:   a,
+		PrefixLen: len(a) * 8,
+	}
+}
+
 // AddressMask is a bitmask for an address.
 type AddressMask string
 
@@ -346,10 +356,9 @@ func (s *Subnet) IsBroadcast(address Address) bool {
 	return s.Prefix() <= 30 && s.Broadcast() == address
 }
 
-// Equal returns true if s equals o.
-//
-// Needed to use cmp.Equal on Subnet as its fields are unexported.
+// Equal returns true if this Subnet is equal to the given Subnet.
 func (s Subnet) Equal(o Subnet) bool {
+	// If this changes, update Route.Equal accordingly.
 	return s == o
 }
 
@@ -561,7 +570,10 @@ type Endpoint interface {
 	// block if no new connections are available.
 	//
 	// The returned Queue is the wait queue for the newly created endpoint.
-	Accept() (Endpoint, *waiter.Queue, *Error)
+	//
+	// If peerAddr is not nil then it is populated with the peer address of the
+	// returned endpoint.
+	Accept(peerAddr *FullAddress) (Endpoint, *waiter.Queue, *Error)
 
 	// Bind binds the endpoint to a specific local address and port.
 	// Specifying a NIC is optional.
@@ -578,8 +590,8 @@ type Endpoint interface {
 	// if waiter.EventIn is set, the endpoint is immediately readable.
 	Readiness(mask waiter.EventMask) waiter.EventMask
 
-	// SetSockOpt sets a socket option. opt should be one of the *Option types.
-	SetSockOpt(opt interface{}) *Error
+	// SetSockOpt sets a socket option.
+	SetSockOpt(opt SettableSocketOption) *Error
 
 	// SetSockOptBool sets a socket option, for simple cases where a value
 	// has the bool type.
@@ -589,9 +601,8 @@ type Endpoint interface {
 	// has the int type.
 	SetSockOptInt(opt SockOptInt, v int) *Error
 
-	// GetSockOpt gets a socket option. opt should be a pointer to one of the
-	// *Option types.
-	GetSockOpt(opt interface{}) *Error
+	// GetSockOpt gets a socket option.
+	GetSockOpt(opt GettableSocketOption) *Error
 
 	// GetSockOptBool gets a socket option for simple cases where a return
 	// value has the bool type.
@@ -620,6 +631,9 @@ type Endpoint interface {
 
 	// SetOwner sets the task owner to the endpoint owner.
 	SetOwner(owner PacketOwner)
+
+	// LastError clears and returns the last error reported by the endpoint.
+	LastError() *Error
 }
 
 // LinkPacketInfo holds Link layer information for a received packet.
@@ -748,6 +762,10 @@ const (
 	// endpoint that all packets being written have an IP header and the
 	// endpoint should not attach an IP header.
 	IPHdrIncludedOption
+
+	// AcceptConnOption is used by GetSockOptBool to indicate if the
+	// socket is a listening socket.
+	AcceptConnOption
 )
 
 // SockOptInt represents socket options which values have the int type.
@@ -839,14 +857,134 @@ const (
 	PMTUDiscoveryProbe
 )
 
-// ErrorOption is used in GetSockOpt to specify that the last error reported by
-// the endpoint should be cleared and returned.
-type ErrorOption struct{}
+// GettableNetworkProtocolOption is a marker interface for network protocol
+// options that may be queried.
+type GettableNetworkProtocolOption interface {
+	isGettableNetworkProtocolOption()
+}
+
+// SettableNetworkProtocolOption is a marker interface for network protocol
+// options that may be set.
+type SettableNetworkProtocolOption interface {
+	isSettableNetworkProtocolOption()
+}
+
+// DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
+// a default TTL.
+type DefaultTTLOption uint8
+
+func (*DefaultTTLOption) isGettableNetworkProtocolOption() {}
+
+func (*DefaultTTLOption) isSettableNetworkProtocolOption() {}
+
+// GettableTransportProtocolOption is a marker interface for transport protocol
+// options that may be queried.
+type GettableTransportProtocolOption interface {
+	isGettableTransportProtocolOption()
+}
+
+// SettableTransportProtocolOption is a marker interface for transport protocol
+// options that may be set.
+type SettableTransportProtocolOption interface {
+	isSettableTransportProtocolOption()
+}
+
+// TCPSACKEnabled the SACK option for TCP.
+//
+// See: https://tools.ietf.org/html/rfc2018.
+type TCPSACKEnabled bool
+
+func (*TCPSACKEnabled) isGettableTransportProtocolOption() {}
+
+func (*TCPSACKEnabled) isSettableTransportProtocolOption() {}
+
+// TCPRecovery is the loss deteoction algorithm used by TCP.
+type TCPRecovery int32
+
+func (*TCPRecovery) isGettableTransportProtocolOption() {}
+
+func (*TCPRecovery) isSettableTransportProtocolOption() {}
+
+const (
+	// TCPRACKLossDetection indicates RACK is used for loss detection and
+	// recovery.
+	TCPRACKLossDetection TCPRecovery = 1 << iota
+
+	// TCPRACKStaticReoWnd indicates the reordering window should not be
+	// adjusted when DSACK is received.
+	TCPRACKStaticReoWnd
+
+	// TCPRACKNoDupTh indicates RACK should not consider the classic three
+	// duplicate acknowledgements rule to mark the segments as lost. This
+	// is used when reordering is not detected.
+	TCPRACKNoDupTh
+)
+
+// TCPDelayEnabled enables/disables Nagle's algorithm in TCP.
+type TCPDelayEnabled bool
+
+func (*TCPDelayEnabled) isGettableTransportProtocolOption() {}
+
+func (*TCPDelayEnabled) isSettableTransportProtocolOption() {}
+
+// TCPSendBufferSizeRangeOption is the send buffer size range for TCP.
+type TCPSendBufferSizeRangeOption struct {
+	Min     int
+	Default int
+	Max     int
+}
+
+func (*TCPSendBufferSizeRangeOption) isGettableTransportProtocolOption() {}
+
+func (*TCPSendBufferSizeRangeOption) isSettableTransportProtocolOption() {}
+
+// TCPReceiveBufferSizeRangeOption is the receive buffer size range for TCP.
+type TCPReceiveBufferSizeRangeOption struct {
+	Min     int
+	Default int
+	Max     int
+}
+
+func (*TCPReceiveBufferSizeRangeOption) isGettableTransportProtocolOption() {}
+
+func (*TCPReceiveBufferSizeRangeOption) isSettableTransportProtocolOption() {}
+
+// TCPAvailableCongestionControlOption is the supported congestion control
+// algorithms for TCP
+type TCPAvailableCongestionControlOption string
+
+func (*TCPAvailableCongestionControlOption) isGettableTransportProtocolOption() {}
+
+func (*TCPAvailableCongestionControlOption) isSettableTransportProtocolOption() {}
+
+// TCPModerateReceiveBufferOption enables/disables receive buffer moderation
+// for TCP.
+type TCPModerateReceiveBufferOption bool
+
+func (*TCPModerateReceiveBufferOption) isGettableTransportProtocolOption() {}
+
+func (*TCPModerateReceiveBufferOption) isSettableTransportProtocolOption() {}
+
+// GettableSocketOption is a marker interface for socket options that may be
+// queried.
+type GettableSocketOption interface {
+	isGettableSocketOption()
+}
+
+// SettableSocketOption is a marker interface for socket options that may be
+// configured.
+type SettableSocketOption interface {
+	isSettableSocketOption()
+}
 
 // BindToDeviceOption is used by SetSockOpt/GetSockOpt to specify that sockets
 // should bind only on a specific NIC.
 type BindToDeviceOption NICID
 
+func (*BindToDeviceOption) isGettableSocketOption() {}
+
+func (*BindToDeviceOption) isSettableSocketOption() {}
+
 // TCPInfoOption is used by GetSockOpt to expose TCP statistics.
 //
 // TODO(b/64800844): Add and populate stat fields.
@@ -855,68 +993,143 @@ type TCPInfoOption struct {
 	RTTVar time.Duration
 }
 
+func (*TCPInfoOption) isGettableSocketOption() {}
+
 // KeepaliveIdleOption is used by SetSockOpt/GetSockOpt to specify the time a
 // connection must remain idle before the first TCP keepalive packet is sent.
 // Once this time is reached, KeepaliveIntervalOption is used instead.
 type KeepaliveIdleOption time.Duration
 
+func (*KeepaliveIdleOption) isGettableSocketOption() {}
+
+func (*KeepaliveIdleOption) isSettableSocketOption() {}
+
 // KeepaliveIntervalOption is used by SetSockOpt/GetSockOpt to specify the
 // interval between sending TCP keepalive packets.
 type KeepaliveIntervalOption time.Duration
 
+func (*KeepaliveIntervalOption) isGettableSocketOption() {}
+
+func (*KeepaliveIntervalOption) isSettableSocketOption() {}
+
 // TCPUserTimeoutOption is used by SetSockOpt/GetSockOpt to specify a user
 // specified timeout for a given TCP connection.
 // See: RFC5482 for details.
 type TCPUserTimeoutOption time.Duration
 
+func (*TCPUserTimeoutOption) isGettableSocketOption() {}
+
+func (*TCPUserTimeoutOption) isSettableSocketOption() {}
+
 // CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get
 // the current congestion control algorithm.
 type CongestionControlOption string
 
-// AvailableCongestionControlOption is used to query the supported congestion
-// control algorithms.
-type AvailableCongestionControlOption string
+func (*CongestionControlOption) isGettableSocketOption() {}
+
+func (*CongestionControlOption) isSettableSocketOption() {}
 
-// ModerateReceiveBufferOption is used by buffer moderation.
-type ModerateReceiveBufferOption bool
+func (*CongestionControlOption) isGettableTransportProtocolOption() {}
+
+func (*CongestionControlOption) isSettableTransportProtocolOption() {}
 
 // TCPLingerTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
 // maximum duration for which a socket lingers in the TCP_FIN_WAIT_2 state
 // before being marked closed.
 type TCPLingerTimeoutOption time.Duration
 
+func (*TCPLingerTimeoutOption) isGettableSocketOption() {}
+
+func (*TCPLingerTimeoutOption) isSettableSocketOption() {}
+
+func (*TCPLingerTimeoutOption) isGettableTransportProtocolOption() {}
+
+func (*TCPLingerTimeoutOption) isSettableTransportProtocolOption() {}
+
 // TCPTimeWaitTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
 // maximum duration for which a socket lingers in the TIME_WAIT state
 // before being marked closed.
 type TCPTimeWaitTimeoutOption time.Duration
 
+func (*TCPTimeWaitTimeoutOption) isGettableSocketOption() {}
+
+func (*TCPTimeWaitTimeoutOption) isSettableSocketOption() {}
+
+func (*TCPTimeWaitTimeoutOption) isGettableTransportProtocolOption() {}
+
+func (*TCPTimeWaitTimeoutOption) isSettableTransportProtocolOption() {}
+
 // TCPDeferAcceptOption is used by SetSockOpt/GetSockOpt to allow a
 // accept to return a completed connection only when there is data to be
 // read. This usually means the listening socket will drop the final ACK
 // for a handshake till the specified timeout until a segment with data arrives.
 type TCPDeferAcceptOption time.Duration
 
+func (*TCPDeferAcceptOption) isGettableSocketOption() {}
+
+func (*TCPDeferAcceptOption) isSettableSocketOption() {}
+
 // TCPMinRTOOption is use by SetSockOpt/GetSockOpt to allow overriding
 // default MinRTO used by the Stack.
 type TCPMinRTOOption time.Duration
 
+func (*TCPMinRTOOption) isGettableSocketOption() {}
+
+func (*TCPMinRTOOption) isSettableSocketOption() {}
+
+func (*TCPMinRTOOption) isGettableTransportProtocolOption() {}
+
+func (*TCPMinRTOOption) isSettableTransportProtocolOption() {}
+
 // TCPMaxRTOOption is use by SetSockOpt/GetSockOpt to allow overriding
 // default MaxRTO used by the Stack.
 type TCPMaxRTOOption time.Duration
 
+func (*TCPMaxRTOOption) isGettableSocketOption() {}
+
+func (*TCPMaxRTOOption) isSettableSocketOption() {}
+
+func (*TCPMaxRTOOption) isGettableTransportProtocolOption() {}
+
+func (*TCPMaxRTOOption) isSettableTransportProtocolOption() {}
+
 // TCPMaxRetriesOption is used by SetSockOpt/GetSockOpt to set/get the
 // maximum number of retransmits after which we time out the connection.
 type TCPMaxRetriesOption uint64
 
+func (*TCPMaxRetriesOption) isGettableSocketOption() {}
+
+func (*TCPMaxRetriesOption) isSettableSocketOption() {}
+
+func (*TCPMaxRetriesOption) isGettableTransportProtocolOption() {}
+
+func (*TCPMaxRetriesOption) isSettableTransportProtocolOption() {}
+
 // TCPSynRcvdCountThresholdOption is used by SetSockOpt/GetSockOpt to specify
 // the number of endpoints that can be in SYN-RCVD state before the stack
 // switches to using SYN cookies.
 type TCPSynRcvdCountThresholdOption uint64
 
+func (*TCPSynRcvdCountThresholdOption) isGettableSocketOption() {}
+
+func (*TCPSynRcvdCountThresholdOption) isSettableSocketOption() {}
+
+func (*TCPSynRcvdCountThresholdOption) isGettableTransportProtocolOption() {}
+
+func (*TCPSynRcvdCountThresholdOption) isSettableTransportProtocolOption() {}
+
 // TCPSynRetriesOption is used by SetSockOpt/GetSockOpt to specify stack-wide
 // default for number of times SYN is retransmitted before aborting a connect.
 type TCPSynRetriesOption uint8
 
+func (*TCPSynRetriesOption) isGettableSocketOption() {}
+
+func (*TCPSynRetriesOption) isSettableSocketOption() {}
+
+func (*TCPSynRetriesOption) isGettableTransportProtocolOption() {}
+
+func (*TCPSynRetriesOption) isSettableTransportProtocolOption() {}
+
 // MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a
 // default interface for multicast.
 type MulticastInterfaceOption struct {
@@ -924,45 +1137,61 @@ type MulticastInterfaceOption struct {
 	InterfaceAddr Address
 }
 
-// MembershipOption is used by SetSockOpt/GetSockOpt as an argument to
-// AddMembershipOption and RemoveMembershipOption.
+func (*MulticastInterfaceOption) isGettableSocketOption() {}
+
+func (*MulticastInterfaceOption) isSettableSocketOption() {}
+
+// MembershipOption is used to identify a multicast membership on an interface.
 type MembershipOption struct {
 	NIC           NICID
 	InterfaceAddr Address
 	MulticastAddr Address
 }
 
-// AddMembershipOption is used by SetSockOpt/GetSockOpt to join a multicast
-// group identified by the given multicast address, on the interface matching
-// the given interface address.
+// AddMembershipOption identifies a multicast group to join on some interface.
 type AddMembershipOption MembershipOption
 
-// RemoveMembershipOption is used by SetSockOpt/GetSockOpt to leave a multicast
-// group identified by the given multicast address, on the interface matching
-// the given interface address.
+func (*AddMembershipOption) isSettableSocketOption() {}
+
+// RemoveMembershipOption identifies a multicast group to leave on some
+// interface.
 type RemoveMembershipOption MembershipOption
 
+func (*RemoveMembershipOption) isSettableSocketOption() {}
+
 // OutOfBandInlineOption is used by SetSockOpt/GetSockOpt to specify whether
 // TCP out-of-band data is delivered along with the normal in-band data.
 type OutOfBandInlineOption int
 
-// DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
-// a default TTL.
-type DefaultTTLOption uint8
+func (*OutOfBandInlineOption) isGettableSocketOption() {}
+
+func (*OutOfBandInlineOption) isSettableSocketOption() {}
 
 // SocketDetachFilterOption is used by SetSockOpt to detach a previously attached
 // classic BPF filter on a given endpoint.
 type SocketDetachFilterOption int
 
+func (*SocketDetachFilterOption) isSettableSocketOption() {}
+
 // OriginalDestinationOption is used to get the original destination address
 // and port of a redirected packet.
 type OriginalDestinationOption FullAddress
 
+func (*OriginalDestinationOption) isGettableSocketOption() {}
+
 // TCPTimeWaitReuseOption is used stack.(*Stack).TransportProtocolOption to
 // specify if the stack can reuse the port bound by an endpoint in TIME-WAIT for
 // new connections when it is safe from protocol viewpoint.
 type TCPTimeWaitReuseOption uint8
 
+func (*TCPTimeWaitReuseOption) isGettableSocketOption() {}
+
+func (*TCPTimeWaitReuseOption) isSettableSocketOption() {}
+
+func (*TCPTimeWaitReuseOption) isGettableTransportProtocolOption() {}
+
+func (*TCPTimeWaitReuseOption) isSettableTransportProtocolOption() {}
+
 const (
 	// TCPTimeWaitReuseDisabled indicates reuse of port bound by endponts in TIME-WAIT cannot
 	// be reused for new connections.
@@ -978,6 +1207,19 @@ const (
 	TCPTimeWaitReuseLoopbackOnly
 )
 
+// LingerOption is used by SetSockOpt/GetSockOpt to set/get the
+// duration for which a socket lingers before returning from Close.
+//
+// +stateify savable
+type LingerOption struct {
+	Enabled bool
+	Timeout time.Duration
+}
+
+func (*LingerOption) isGettableSocketOption() {}
+
+func (*LingerOption) isSettableSocketOption() {}
+
 // IPPacketInfo is the message structure for IP_PKTINFO.
 //
 // +stateify savable
@@ -1017,10 +1259,19 @@ func (r Route) String() string {
 	return out.String()
 }
 
+// Equal returns true if the given Route is equal to this Route.
+func (r Route) Equal(to Route) bool {
+	// NOTE: This relies on the fact that r.Destination == to.Destination
+	return r == to
+}
+
 // TransportProtocolNumber is the number of a transport protocol.
 type TransportProtocolNumber uint32
 
-// NetworkProtocolNumber is the number of a network protocol.
+// NetworkProtocolNumber is the EtherType of a network protocol in an Ethernet
+// frame.
+//
+// See: https://www.iana.org/assignments/ieee-802-numbers/ieee-802-numbers.xhtml
 type NetworkProtocolNumber uint32
 
 // A StatCounter keeps track of a statistic.
@@ -1183,6 +1434,10 @@ type ICMPv6ReceivedPacketStats struct {
 	// Invalid is the total number of ICMPv6 packets received that the
 	// transport layer could not parse.
 	Invalid *StatCounter
+
+	// RouterOnlyPacketsDroppedByHost is the total number of ICMPv6 packets
+	// dropped due to being router-specific packets.
+	RouterOnlyPacketsDroppedByHost *StatCounter
 }
 
 // ICMPStats collects ICMP-specific stats (both v4 and v6).
@@ -1238,6 +1493,18 @@ type IPStats struct {
 	// MalformedFragmentsReceived is the total number of IP Fragments that were
 	// dropped due to the fragment failing validation checks.
 	MalformedFragmentsReceived *StatCounter
+
+	// IPTablesPreroutingDropped is the total number of IP packets dropped
+	// in the Prerouting chain.
+	IPTablesPreroutingDropped *StatCounter
+
+	// IPTablesInputDropped is the total number of IP packets dropped in
+	// the Input chain.
+	IPTablesInputDropped *StatCounter
+
+	// IPTablesOutputDropped is the total number of IP packets dropped in
+	// the Output chain.
+	IPTablesOutputDropped *StatCounter
 }
 
 // TCPStats collects TCP-specific stats.
@@ -1366,9 +1633,6 @@ type UDPStats struct {
 
 	// ChecksumErrors is the number of datagrams dropped due to bad checksums.
 	ChecksumErrors *StatCounter
-
-	// InvalidSourceAddress is the number of invalid sourced datagrams dropped.
-	InvalidSourceAddress *StatCounter
 }
 
 // Stats holds statistics about the networking stack.
diff --git a/pkg/tcpip/tests/integration/BUILD b/pkg/tcpip/tests/integration/BUILD
index 6d52af98a..34aab32d0 100644
--- a/pkg/tcpip/tests/integration/BUILD
+++ b/pkg/tcpip/tests/integration/BUILD
@@ -5,12 +5,21 @@ package(licenses = ["notice"])
 go_test(
     name = "integration_test",
     size = "small",
-    srcs = ["multicast_broadcast_test.go"],
+    srcs = [
+        "forward_test.go",
+        "link_resolution_test.go",
+        "loopback_test.go",
+        "multicast_broadcast_test.go",
+    ],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
         "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/link/ethernet",
+        "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/link/pipe",
+        "//pkg/tcpip/network/arp",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
         "//pkg/tcpip/stack",
diff --git a/pkg/tcpip/tests/integration/forward_test.go b/pkg/tcpip/tests/integration/forward_test.go
new file mode 100644
index 000000000..0dcef7b04
--- /dev/null
+++ b/pkg/tcpip/tests/integration/forward_test.go
@@ -0,0 +1,379 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package integration_test
+
+import (
+	"net"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/link/ethernet"
+	"gvisor.dev/gvisor/pkg/tcpip/link/pipe"
+	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+func TestForwarding(t *testing.T) {
+	const (
+		host1NICLinkAddr   = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x06")
+		routerNIC1LinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x07")
+		routerNIC2LinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x08")
+		host2NICLinkAddr   = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x09")
+
+		host1NICID   = 1
+		routerNICID1 = 2
+		routerNICID2 = 3
+		host2NICID   = 4
+
+		listenPort = 8080
+	)
+
+	host1IPv4Addr := tcpip.ProtocolAddress{
+		Protocol: ipv4.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("192.168.0.2").To4()),
+			PrefixLen: 24,
+		},
+	}
+	routerNIC1IPv4Addr := tcpip.ProtocolAddress{
+		Protocol: ipv4.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("192.168.0.1").To4()),
+			PrefixLen: 24,
+		},
+	}
+	routerNIC2IPv4Addr := tcpip.ProtocolAddress{
+		Protocol: ipv4.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("10.0.0.1").To4()),
+			PrefixLen: 8,
+		},
+	}
+	host2IPv4Addr := tcpip.ProtocolAddress{
+		Protocol: ipv4.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("10.0.0.2").To4()),
+			PrefixLen: 8,
+		},
+	}
+	host1IPv6Addr := tcpip.ProtocolAddress{
+		Protocol: ipv6.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("a::2").To16()),
+			PrefixLen: 64,
+		},
+	}
+	routerNIC1IPv6Addr := tcpip.ProtocolAddress{
+		Protocol: ipv6.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("a::1").To16()),
+			PrefixLen: 64,
+		},
+	}
+	routerNIC2IPv6Addr := tcpip.ProtocolAddress{
+		Protocol: ipv6.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("b::1").To16()),
+			PrefixLen: 64,
+		},
+	}
+	host2IPv6Addr := tcpip.ProtocolAddress{
+		Protocol: ipv6.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("b::2").To16()),
+			PrefixLen: 64,
+		},
+	}
+
+	type endpointAndAddresses struct {
+		serverEP         tcpip.Endpoint
+		serverAddr       tcpip.Address
+		serverReadableCH chan struct{}
+
+		clientEP         tcpip.Endpoint
+		clientAddr       tcpip.Address
+		clientReadableCH chan struct{}
+	}
+
+	newEP := func(t *testing.T, s *stack.Stack, transProto tcpip.TransportProtocolNumber, netProto tcpip.NetworkProtocolNumber) (tcpip.Endpoint, chan struct{}) {
+		t.Helper()
+		var wq waiter.Queue
+		we, ch := waiter.NewChannelEntry(nil)
+		wq.EventRegister(&we, waiter.EventIn)
+		ep, err := s.NewEndpoint(transProto, netProto, &wq)
+		if err != nil {
+			t.Fatalf("s.NewEndpoint(%d, %d, _): %s", transProto, netProto, err)
+		}
+
+		t.Cleanup(func() {
+			wq.EventUnregister(&we)
+		})
+
+		return ep, ch
+	}
+
+	tests := []struct {
+		name       string
+		epAndAddrs func(t *testing.T, host1Stack, routerStack, host2Stack *stack.Stack) endpointAndAddresses
+	}{
+		{
+			name: "IPv4 host1 server with host2 client",
+			epAndAddrs: func(t *testing.T, host1Stack, routerStack, host2Stack *stack.Stack) endpointAndAddresses {
+				ep1, ep1WECH := newEP(t, host1Stack, udp.ProtocolNumber, ipv4.ProtocolNumber)
+				ep2, ep2WECH := newEP(t, host2Stack, udp.ProtocolNumber, ipv4.ProtocolNumber)
+				return endpointAndAddresses{
+					serverEP:         ep1,
+					serverAddr:       host1IPv4Addr.AddressWithPrefix.Address,
+					serverReadableCH: ep1WECH,
+
+					clientEP:         ep2,
+					clientAddr:       host2IPv4Addr.AddressWithPrefix.Address,
+					clientReadableCH: ep2WECH,
+				}
+			},
+		},
+		{
+			name: "IPv6 host2 server with host1 client",
+			epAndAddrs: func(t *testing.T, host1Stack, routerStack, host2Stack *stack.Stack) endpointAndAddresses {
+				ep1, ep1WECH := newEP(t, host2Stack, udp.ProtocolNumber, ipv6.ProtocolNumber)
+				ep2, ep2WECH := newEP(t, host1Stack, udp.ProtocolNumber, ipv6.ProtocolNumber)
+				return endpointAndAddresses{
+					serverEP:         ep1,
+					serverAddr:       host2IPv6Addr.AddressWithPrefix.Address,
+					serverReadableCH: ep1WECH,
+
+					clientEP:         ep2,
+					clientAddr:       host1IPv6Addr.AddressWithPrefix.Address,
+					clientReadableCH: ep2WECH,
+				}
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			stackOpts := stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocolFactory{arp.NewProtocol, ipv4.NewProtocol, ipv6.NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
+			}
+
+			host1Stack := stack.New(stackOpts)
+			routerStack := stack.New(stackOpts)
+			host2Stack := stack.New(stackOpts)
+
+			host1NIC, routerNIC1 := pipe.New(host1NICLinkAddr, routerNIC1LinkAddr)
+			routerNIC2, host2NIC := pipe.New(routerNIC2LinkAddr, host2NICLinkAddr)
+
+			if err := host1Stack.CreateNIC(host1NICID, ethernet.New(host1NIC)); err != nil {
+				t.Fatalf("host1Stack.CreateNIC(%d, _): %s", host1NICID, err)
+			}
+			if err := routerStack.CreateNIC(routerNICID1, ethernet.New(routerNIC1)); err != nil {
+				t.Fatalf("routerStack.CreateNIC(%d, _): %s", routerNICID1, err)
+			}
+			if err := routerStack.CreateNIC(routerNICID2, ethernet.New(routerNIC2)); err != nil {
+				t.Fatalf("routerStack.CreateNIC(%d, _): %s", routerNICID2, err)
+			}
+			if err := host2Stack.CreateNIC(host2NICID, ethernet.New(host2NIC)); err != nil {
+				t.Fatalf("host2Stack.CreateNIC(%d, _): %s", host2NICID, err)
+			}
+
+			if err := routerStack.SetForwarding(ipv4.ProtocolNumber, true); err != nil {
+				t.Fatalf("routerStack.SetForwarding(%d): %s", ipv4.ProtocolNumber, err)
+			}
+			if err := routerStack.SetForwarding(ipv6.ProtocolNumber, true); err != nil {
+				t.Fatalf("routerStack.SetForwarding(%d): %s", ipv6.ProtocolNumber, err)
+			}
+
+			if err := host1Stack.AddAddress(host1NICID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+				t.Fatalf("host1Stack.AddAddress(%d, %d, %s): %s", host1NICID, arp.ProtocolNumber, arp.ProtocolAddress, err)
+			}
+			if err := routerStack.AddAddress(routerNICID1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+				t.Fatalf("routerStack.AddAddress(%d, %d, %s): %s", routerNICID1, arp.ProtocolNumber, arp.ProtocolAddress, err)
+			}
+			if err := routerStack.AddAddress(routerNICID2, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+				t.Fatalf("routerStack.AddAddress(%d, %d, %s): %s", routerNICID2, arp.ProtocolNumber, arp.ProtocolAddress, err)
+			}
+			if err := host2Stack.AddAddress(host2NICID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+				t.Fatalf("host2Stack.AddAddress(%d, %d, %s): %s", host2NICID, arp.ProtocolNumber, arp.ProtocolAddress, err)
+			}
+
+			if err := host1Stack.AddProtocolAddress(host1NICID, host1IPv4Addr); err != nil {
+				t.Fatalf("host1Stack.AddProtocolAddress(%d, %#v): %s", host1NICID, host1IPv4Addr, err)
+			}
+			if err := routerStack.AddProtocolAddress(routerNICID1, routerNIC1IPv4Addr); err != nil {
+				t.Fatalf("routerStack.AddProtocolAddress(%d, %#v): %s", routerNICID1, routerNIC1IPv4Addr, err)
+			}
+			if err := routerStack.AddProtocolAddress(routerNICID2, routerNIC2IPv4Addr); err != nil {
+				t.Fatalf("routerStack.AddProtocolAddress(%d, %#v): %s", routerNICID2, routerNIC2IPv4Addr, err)
+			}
+			if err := host2Stack.AddProtocolAddress(host2NICID, host2IPv4Addr); err != nil {
+				t.Fatalf("host2Stack.AddProtocolAddress(%d, %#v): %s", host2NICID, host2IPv4Addr, err)
+			}
+			if err := host1Stack.AddProtocolAddress(host1NICID, host1IPv6Addr); err != nil {
+				t.Fatalf("host1Stack.AddProtocolAddress(%d, %#v): %s", host1NICID, host1IPv6Addr, err)
+			}
+			if err := routerStack.AddProtocolAddress(routerNICID1, routerNIC1IPv6Addr); err != nil {
+				t.Fatalf("routerStack.AddProtocolAddress(%d, %#v): %s", routerNICID1, routerNIC1IPv6Addr, err)
+			}
+			if err := routerStack.AddProtocolAddress(routerNICID2, routerNIC2IPv6Addr); err != nil {
+				t.Fatalf("routerStack.AddProtocolAddress(%d, %#v): %s", routerNICID2, routerNIC2IPv6Addr, err)
+			}
+			if err := host2Stack.AddProtocolAddress(host2NICID, host2IPv6Addr); err != nil {
+				t.Fatalf("host2Stack.AddProtocolAddress(%d, %#v): %s", host2NICID, host2IPv6Addr, err)
+			}
+
+			host1Stack.SetRouteTable([]tcpip.Route{
+				tcpip.Route{
+					Destination: host1IPv4Addr.AddressWithPrefix.Subnet(),
+					NIC:         host1NICID,
+				},
+				tcpip.Route{
+					Destination: host1IPv6Addr.AddressWithPrefix.Subnet(),
+					NIC:         host1NICID,
+				},
+				tcpip.Route{
+					Destination: host2IPv4Addr.AddressWithPrefix.Subnet(),
+					Gateway:     routerNIC1IPv4Addr.AddressWithPrefix.Address,
+					NIC:         host1NICID,
+				},
+				tcpip.Route{
+					Destination: host2IPv6Addr.AddressWithPrefix.Subnet(),
+					Gateway:     routerNIC1IPv6Addr.AddressWithPrefix.Address,
+					NIC:         host1NICID,
+				},
+			})
+			routerStack.SetRouteTable([]tcpip.Route{
+				tcpip.Route{
+					Destination: routerNIC1IPv4Addr.AddressWithPrefix.Subnet(),
+					NIC:         routerNICID1,
+				},
+				tcpip.Route{
+					Destination: routerNIC1IPv6Addr.AddressWithPrefix.Subnet(),
+					NIC:         routerNICID1,
+				},
+				tcpip.Route{
+					Destination: routerNIC2IPv4Addr.AddressWithPrefix.Subnet(),
+					NIC:         routerNICID2,
+				},
+				tcpip.Route{
+					Destination: routerNIC2IPv6Addr.AddressWithPrefix.Subnet(),
+					NIC:         routerNICID2,
+				},
+			})
+			host2Stack.SetRouteTable([]tcpip.Route{
+				tcpip.Route{
+					Destination: host2IPv4Addr.AddressWithPrefix.Subnet(),
+					NIC:         host2NICID,
+				},
+				tcpip.Route{
+					Destination: host2IPv6Addr.AddressWithPrefix.Subnet(),
+					NIC:         host2NICID,
+				},
+				tcpip.Route{
+					Destination: host1IPv4Addr.AddressWithPrefix.Subnet(),
+					Gateway:     routerNIC2IPv4Addr.AddressWithPrefix.Address,
+					NIC:         host2NICID,
+				},
+				tcpip.Route{
+					Destination: host1IPv6Addr.AddressWithPrefix.Subnet(),
+					Gateway:     routerNIC2IPv6Addr.AddressWithPrefix.Address,
+					NIC:         host2NICID,
+				},
+			})
+
+			epsAndAddrs := test.epAndAddrs(t, host1Stack, routerStack, host2Stack)
+			defer epsAndAddrs.serverEP.Close()
+			defer epsAndAddrs.clientEP.Close()
+
+			serverAddr := tcpip.FullAddress{Addr: epsAndAddrs.serverAddr, Port: listenPort}
+			if err := epsAndAddrs.serverEP.Bind(serverAddr); err != nil {
+				t.Fatalf("epsAndAddrs.serverEP.Bind(%#v): %s", serverAddr, err)
+			}
+			clientAddr := tcpip.FullAddress{Addr: epsAndAddrs.clientAddr}
+			if err := epsAndAddrs.clientEP.Bind(clientAddr); err != nil {
+				t.Fatalf("epsAndAddrs.clientEP.Bind(%#v): %s", clientAddr, err)
+			}
+
+			write := func(ep tcpip.Endpoint, data []byte, to *tcpip.FullAddress) {
+				t.Helper()
+
+				dataPayload := tcpip.SlicePayload(data)
+				wOpts := tcpip.WriteOptions{To: to}
+				n, ch, err := ep.Write(dataPayload, wOpts)
+				if err == tcpip.ErrNoLinkAddress {
+					// Wait for link resolution to complete.
+					<-ch
+
+					n, _, err = ep.Write(dataPayload, wOpts)
+				} else if err != nil {
+					t.Fatalf("ep.Write(_, _): %s", err)
+				}
+
+				if err != nil {
+					t.Fatalf("ep.Write(_, _): %s", err)
+				}
+				if want := int64(len(data)); n != want {
+					t.Fatalf("got ep.Write(_, _) = (%d, _, _), want = (%d, _, _)", n, want)
+				}
+			}
+
+			data := []byte{1, 2, 3, 4}
+			write(epsAndAddrs.clientEP, data, &serverAddr)
+
+			read := func(ch chan struct{}, ep tcpip.Endpoint, data []byte, expectedFrom tcpip.Address) tcpip.FullAddress {
+				t.Helper()
+
+				// Wait for the endpoint to be readable.
+				<-ch
+
+				var addr tcpip.FullAddress
+				v, _, err := ep.Read(&addr)
+				if err != nil {
+					t.Fatalf("ep.Read(_): %s", err)
+				}
+
+				if diff := cmp.Diff(v, buffer.View(data)); diff != "" {
+					t.Errorf("received data mismatch (-want +got):\n%s", diff)
+				}
+				if addr.Addr != expectedFrom {
+					t.Errorf("got addr.Addr = %s, want = %s", addr.Addr, expectedFrom)
+				}
+
+				if t.Failed() {
+					t.FailNow()
+				}
+
+				return addr
+			}
+
+			addr := read(epsAndAddrs.serverReadableCH, epsAndAddrs.serverEP, data, epsAndAddrs.clientAddr)
+			// Unspecify the NIC since NIC IDs are meaningless across stacks.
+			addr.NIC = 0
+
+			data = tcpip.SlicePayload([]byte{5, 6, 7, 8, 9, 10, 11, 12})
+			write(epsAndAddrs.serverEP, data, &addr)
+			addr = read(epsAndAddrs.clientReadableCH, epsAndAddrs.clientEP, data, epsAndAddrs.serverAddr)
+			if addr.Port != listenPort {
+				t.Errorf("got addr.Port = %d, want = %d", addr.Port, listenPort)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/tests/integration/link_resolution_test.go b/pkg/tcpip/tests/integration/link_resolution_test.go
new file mode 100644
index 000000000..6ddcda70c
--- /dev/null
+++ b/pkg/tcpip/tests/integration/link_resolution_test.go
@@ -0,0 +1,220 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package integration_test
+
+import (
+	"net"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/ethernet"
+	"gvisor.dev/gvisor/pkg/tcpip/link/pipe"
+	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+var (
+	host1NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x06")
+	host2NICLinkAddr = tcpip.LinkAddress("\x02\x03\x03\x04\x05\x09")
+
+	host1IPv4Addr = tcpip.ProtocolAddress{
+		Protocol: ipv4.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("192.168.0.1").To4()),
+			PrefixLen: 24,
+		},
+	}
+	host2IPv4Addr = tcpip.ProtocolAddress{
+		Protocol: ipv4.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("192.168.0.2").To4()),
+			PrefixLen: 8,
+		},
+	}
+	host1IPv6Addr = tcpip.ProtocolAddress{
+		Protocol: ipv6.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("a::1").To16()),
+			PrefixLen: 64,
+		},
+	}
+	host2IPv6Addr = tcpip.ProtocolAddress{
+		Protocol: ipv6.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(net.ParseIP("a::2").To16()),
+			PrefixLen: 64,
+		},
+	}
+)
+
+// TestPing tests that two hosts can ping eachother when link resolution is
+// enabled.
+func TestPing(t *testing.T) {
+	const (
+		host1NICID = 1
+		host2NICID = 4
+
+		// icmpDataOffset is the offset to the data in both ICMPv4 and ICMPv6 echo
+		// request/reply packets.
+		icmpDataOffset = 8
+	)
+
+	tests := []struct {
+		name       string
+		transProto tcpip.TransportProtocolNumber
+		netProto   tcpip.NetworkProtocolNumber
+		remoteAddr tcpip.Address
+		icmpBuf    func(*testing.T) buffer.View
+	}{
+		{
+			name:       "IPv4 Ping",
+			transProto: icmp.ProtocolNumber4,
+			netProto:   ipv4.ProtocolNumber,
+			remoteAddr: host2IPv4Addr.AddressWithPrefix.Address,
+			icmpBuf: func(t *testing.T) buffer.View {
+				data := [8]byte{1, 2, 3, 4, 5, 6, 7, 8}
+				hdr := header.ICMPv4(make([]byte, header.ICMPv4MinimumSize+len(data)))
+				hdr.SetType(header.ICMPv4Echo)
+				if n := copy(hdr.Payload(), data[:]); n != len(data) {
+					t.Fatalf("copied %d bytes but expected to copy %d bytes", n, len(data))
+				}
+				return buffer.View(hdr)
+			},
+		},
+		{
+			name:       "IPv6 Ping",
+			transProto: icmp.ProtocolNumber6,
+			netProto:   ipv6.ProtocolNumber,
+			remoteAddr: host2IPv6Addr.AddressWithPrefix.Address,
+			icmpBuf: func(t *testing.T) buffer.View {
+				data := [8]byte{1, 2, 3, 4, 5, 6, 7, 8}
+				hdr := header.ICMPv6(make([]byte, header.ICMPv6MinimumSize+len(data)))
+				hdr.SetType(header.ICMPv6EchoRequest)
+				if n := copy(hdr.Payload(), data[:]); n != len(data) {
+					t.Fatalf("copied %d bytes but expected to copy %d bytes", n, len(data))
+				}
+				return buffer.View(hdr)
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			stackOpts := stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocolFactory{arp.NewProtocol, ipv4.NewProtocol, ipv6.NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol4, icmp.NewProtocol6},
+			}
+
+			host1Stack := stack.New(stackOpts)
+			host2Stack := stack.New(stackOpts)
+
+			host1NIC, host2NIC := pipe.New(host1NICLinkAddr, host2NICLinkAddr)
+
+			if err := host1Stack.CreateNIC(host1NICID, ethernet.New(host1NIC)); err != nil {
+				t.Fatalf("host1Stack.CreateNIC(%d, _): %s", host1NICID, err)
+			}
+			if err := host2Stack.CreateNIC(host2NICID, ethernet.New(host2NIC)); err != nil {
+				t.Fatalf("host2Stack.CreateNIC(%d, _): %s", host2NICID, err)
+			}
+
+			if err := host1Stack.AddAddress(host1NICID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+				t.Fatalf("host1Stack.AddAddress(%d, %d, %s): %s", host1NICID, arp.ProtocolNumber, arp.ProtocolAddress, err)
+			}
+			if err := host2Stack.AddAddress(host2NICID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+				t.Fatalf("host2Stack.AddAddress(%d, %d, %s): %s", host2NICID, arp.ProtocolNumber, arp.ProtocolAddress, err)
+			}
+
+			if err := host1Stack.AddProtocolAddress(host1NICID, host1IPv4Addr); err != nil {
+				t.Fatalf("host1Stack.AddProtocolAddress(%d, %#v): %s", host1NICID, host1IPv4Addr, err)
+			}
+			if err := host2Stack.AddProtocolAddress(host2NICID, host2IPv4Addr); err != nil {
+				t.Fatalf("host2Stack.AddProtocolAddress(%d, %#v): %s", host2NICID, host2IPv4Addr, err)
+			}
+			if err := host1Stack.AddProtocolAddress(host1NICID, host1IPv6Addr); err != nil {
+				t.Fatalf("host1Stack.AddProtocolAddress(%d, %#v): %s", host1NICID, host1IPv6Addr, err)
+			}
+			if err := host2Stack.AddProtocolAddress(host2NICID, host2IPv6Addr); err != nil {
+				t.Fatalf("host2Stack.AddProtocolAddress(%d, %#v): %s", host2NICID, host2IPv6Addr, err)
+			}
+
+			host1Stack.SetRouteTable([]tcpip.Route{
+				tcpip.Route{
+					Destination: host1IPv4Addr.AddressWithPrefix.Subnet(),
+					NIC:         host1NICID,
+				},
+				tcpip.Route{
+					Destination: host1IPv6Addr.AddressWithPrefix.Subnet(),
+					NIC:         host1NICID,
+				},
+			})
+			host2Stack.SetRouteTable([]tcpip.Route{
+				tcpip.Route{
+					Destination: host2IPv4Addr.AddressWithPrefix.Subnet(),
+					NIC:         host2NICID,
+				},
+				tcpip.Route{
+					Destination: host2IPv6Addr.AddressWithPrefix.Subnet(),
+					NIC:         host2NICID,
+				},
+			})
+
+			var wq waiter.Queue
+			we, waiterCH := waiter.NewChannelEntry(nil)
+			wq.EventRegister(&we, waiter.EventIn)
+			ep, err := host1Stack.NewEndpoint(test.transProto, test.netProto, &wq)
+			if err != nil {
+				t.Fatalf("host1Stack.NewEndpoint(%d, %d, _): %s", test.transProto, test.netProto, err)
+			}
+			defer ep.Close()
+
+			// The first write should trigger link resolution.
+			icmpBuf := test.icmpBuf(t)
+			wOpts := tcpip.WriteOptions{To: &tcpip.FullAddress{Addr: test.remoteAddr}}
+			if _, ch, err := ep.Write(tcpip.SlicePayload(icmpBuf), wOpts); err != tcpip.ErrNoLinkAddress {
+				t.Fatalf("got ep.Write(_, _) = %s, want = %s", err, tcpip.ErrNoLinkAddress)
+			} else {
+				// Wait for link resolution to complete.
+				<-ch
+			}
+			if n, _, err := ep.Write(tcpip.SlicePayload(icmpBuf), wOpts); err != nil {
+				t.Fatalf("ep.Write(_, _): %s", err)
+			} else if want := int64(len(icmpBuf)); n != want {
+				t.Fatalf("got ep.Write(_, _) = (%d, _, _), want = (%d, _, _)", n, want)
+			}
+
+			// Wait for the endpoint to be readable.
+			<-waiterCH
+
+			var addr tcpip.FullAddress
+			v, _, err := ep.Read(&addr)
+			if err != nil {
+				t.Fatalf("ep.Read(_): %s", err)
+			}
+			if diff := cmp.Diff(v[icmpDataOffset:], icmpBuf[icmpDataOffset:]); diff != "" {
+				t.Errorf("received data mismatch (-want +got):\n%s", diff)
+			}
+			if addr.Addr != test.remoteAddr {
+				t.Errorf("got addr.Addr = %s, want = %s", addr.Addr, test.remoteAddr)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/tests/integration/loopback_test.go b/pkg/tcpip/tests/integration/loopback_test.go
new file mode 100644
index 000000000..e8caf09ba
--- /dev/null
+++ b/pkg/tcpip/tests/integration/loopback_test.go
@@ -0,0 +1,314 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package integration_test
+
+import (
+	"testing"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+var _ ipv6.NDPDispatcher = (*ndpDispatcher)(nil)
+
+type ndpDispatcher struct{}
+
+func (*ndpDispatcher) OnDuplicateAddressDetectionStatus(tcpip.NICID, tcpip.Address, bool, *tcpip.Error) {
+}
+
+func (*ndpDispatcher) OnDefaultRouterDiscovered(tcpip.NICID, tcpip.Address) bool {
+	return false
+}
+
+func (*ndpDispatcher) OnDefaultRouterInvalidated(tcpip.NICID, tcpip.Address) {}
+
+func (*ndpDispatcher) OnOnLinkPrefixDiscovered(tcpip.NICID, tcpip.Subnet) bool {
+	return false
+}
+
+func (*ndpDispatcher) OnOnLinkPrefixInvalidated(tcpip.NICID, tcpip.Subnet) {}
+
+func (*ndpDispatcher) OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool {
+	return true
+}
+
+func (*ndpDispatcher) OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix) {}
+
+func (*ndpDispatcher) OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix) {}
+
+func (*ndpDispatcher) OnRecursiveDNSServerOption(tcpip.NICID, []tcpip.Address, time.Duration) {}
+
+func (*ndpDispatcher) OnDNSSearchListOption(tcpip.NICID, []string, time.Duration) {}
+
+func (*ndpDispatcher) OnDHCPv6Configuration(tcpip.NICID, ipv6.DHCPv6ConfigurationFromNDPRA) {}
+
+// TestInitialLoopbackAddresses tests that the loopback interface does not
+// auto-generate a link-local address when it is brought up.
+func TestInitialLoopbackAddresses(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocolWithOptions(ipv6.Options{
+			NDPDisp:              &ndpDispatcher{},
+			AutoGenIPv6LinkLocal: true,
+			OpaqueIIDOpts: ipv6.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: func(nicID tcpip.NICID, nicName string) string {
+					t.Fatalf("should not attempt to get name for NIC with ID = %d; nicName = %s", nicID, nicName)
+					return ""
+				},
+			},
+		})},
+	})
+
+	if err := s.CreateNIC(nicID, loopback.New()); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+	}
+
+	nicsInfo := s.NICInfo()
+	if nicInfo, ok := nicsInfo[nicID]; !ok {
+		t.Fatalf("did not find NIC with ID = %d in s.NICInfo() = %#v", nicID, nicsInfo)
+	} else if got := len(nicInfo.ProtocolAddresses); got != 0 {
+		t.Fatalf("got len(nicInfo.ProtocolAddresses) = %d, want = 0; nicInfo.ProtocolAddresses = %#v", got, nicInfo.ProtocolAddresses)
+	}
+}
+
+// TestLoopbackAcceptAllInSubnet tests that a loopback interface considers
+// itself bound to all addresses in the subnet of an assigned address.
+func TestLoopbackAcceptAllInSubnet(t *testing.T) {
+	const (
+		nicID     = 1
+		localPort = 80
+	)
+
+	data := []byte{1, 2, 3, 4}
+
+	ipv4ProtocolAddress := tcpip.ProtocolAddress{
+		Protocol:          header.IPv4ProtocolNumber,
+		AddressWithPrefix: ipv4Addr,
+	}
+	ipv4Bytes := []byte(ipv4Addr.Address)
+	ipv4Bytes[len(ipv4Bytes)-1]++
+	otherIPv4Address := tcpip.Address(ipv4Bytes)
+
+	ipv6ProtocolAddress := tcpip.ProtocolAddress{
+		Protocol:          header.IPv6ProtocolNumber,
+		AddressWithPrefix: ipv6Addr,
+	}
+	ipv6Bytes := []byte(ipv6Addr.Address)
+	ipv6Bytes[len(ipv6Bytes)-1]++
+	otherIPv6Address := tcpip.Address(ipv6Bytes)
+
+	tests := []struct {
+		name       string
+		addAddress tcpip.ProtocolAddress
+		bindAddr   tcpip.Address
+		dstAddr    tcpip.Address
+		expectRx   bool
+	}{
+		{
+			name:       "IPv4 bind to wildcard and send to assigned address",
+			addAddress: ipv4ProtocolAddress,
+			dstAddr:    ipv4Addr.Address,
+			expectRx:   true,
+		},
+		{
+			name:       "IPv4 bind to wildcard and send to other subnet-local address",
+			addAddress: ipv4ProtocolAddress,
+			dstAddr:    otherIPv4Address,
+			expectRx:   true,
+		},
+		{
+			name:       "IPv4 bind to wildcard send to other address",
+			addAddress: ipv4ProtocolAddress,
+			dstAddr:    remoteIPv4Addr,
+			expectRx:   false,
+		},
+		{
+			name:       "IPv4 bind to other subnet-local address and send to assigned address",
+			addAddress: ipv4ProtocolAddress,
+			bindAddr:   otherIPv4Address,
+			dstAddr:    ipv4Addr.Address,
+			expectRx:   false,
+		},
+		{
+			name:       "IPv4 bind and send to other subnet-local address",
+			addAddress: ipv4ProtocolAddress,
+			bindAddr:   otherIPv4Address,
+			dstAddr:    otherIPv4Address,
+			expectRx:   true,
+		},
+		{
+			name:       "IPv4 bind to assigned address and send to other subnet-local address",
+			addAddress: ipv4ProtocolAddress,
+			bindAddr:   ipv4Addr.Address,
+			dstAddr:    otherIPv4Address,
+			expectRx:   false,
+		},
+
+		{
+			name:       "IPv6 bind and send to assigned address",
+			addAddress: ipv6ProtocolAddress,
+			bindAddr:   ipv6Addr.Address,
+			dstAddr:    ipv6Addr.Address,
+			expectRx:   true,
+		},
+		{
+			name:       "IPv6 bind to wildcard and send to other subnet-local address",
+			addAddress: ipv6ProtocolAddress,
+			dstAddr:    otherIPv6Address,
+			expectRx:   false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
+			})
+			if err := s.CreateNIC(nicID, loopback.New()); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+			}
+			if err := s.AddProtocolAddress(nicID, test.addAddress); err != nil {
+				t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, test.addAddress, err)
+			}
+			s.SetRouteTable([]tcpip.Route{
+				tcpip.Route{
+					Destination: header.IPv4EmptySubnet,
+					NIC:         nicID,
+				},
+				tcpip.Route{
+					Destination: header.IPv6EmptySubnet,
+					NIC:         nicID,
+				},
+			})
+
+			wq := waiter.Queue{}
+			rep, err := s.NewEndpoint(udp.ProtocolNumber, test.addAddress.Protocol, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, test.addAddress.Protocol, err)
+			}
+			defer rep.Close()
+
+			bindAddr := tcpip.FullAddress{Addr: test.bindAddr, Port: localPort}
+			if err := rep.Bind(bindAddr); err != nil {
+				t.Fatalf("rep.Bind(%+v): %s", bindAddr, err)
+			}
+
+			sep, err := s.NewEndpoint(udp.ProtocolNumber, test.addAddress.Protocol, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, test.addAddress.Protocol, err)
+			}
+			defer sep.Close()
+
+			wopts := tcpip.WriteOptions{
+				To: &tcpip.FullAddress{
+					Addr: test.dstAddr,
+					Port: localPort,
+				},
+			}
+			n, _, err := sep.Write(tcpip.SlicePayload(data), wopts)
+			if err != nil {
+				t.Fatalf("sep.Write(_, _): %s", err)
+			}
+			if want := int64(len(data)); n != want {
+				t.Fatalf("got sep.Write(_, _) = (%d, _, nil), want = (%d, _, nil)", n, want)
+			}
+
+			if gotPayload, _, err := rep.Read(nil); test.expectRx {
+				if err != nil {
+					t.Fatalf("reep.Read(nil): %s", err)
+				}
+				if diff := cmp.Diff(buffer.View(data), gotPayload); diff != "" {
+					t.Errorf("got UDP payload mismatch (-want +got):\n%s", diff)
+				}
+			} else {
+				if err != tcpip.ErrWouldBlock {
+					t.Fatalf("got rep.Read(nil) = (%x, _, %s), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+				}
+			}
+		})
+	}
+}
+
+// TestLoopbackSubnetLifetimeBoundToAddr tests that the lifetime of an address
+// in a loopback interface's associated subnet is bound to the permanently bound
+// address.
+func TestLoopbackSubnetLifetimeBoundToAddr(t *testing.T) {
+	const nicID = 1
+
+	protoAddr := tcpip.ProtocolAddress{
+		Protocol:          ipv4.ProtocolNumber,
+		AddressWithPrefix: ipv4Addr,
+	}
+	addrBytes := []byte(ipv4Addr.Address)
+	addrBytes[len(addrBytes)-1]++
+	otherAddr := tcpip.Address(addrBytes)
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol},
+	})
+	if err := s.CreateNIC(nicID, loopback.New()); err != nil {
+		t.Fatalf("s.CreateNIC(%d, _): %s", nicID, err)
+	}
+	if err := s.AddProtocolAddress(nicID, protoAddr); err != nil {
+		t.Fatalf("s.AddProtocolAddress(%d, %#v): %s", nicID, protoAddr, err)
+	}
+	s.SetRouteTable([]tcpip.Route{
+		tcpip.Route{
+			Destination: header.IPv4EmptySubnet,
+			NIC:         nicID,
+		},
+	})
+
+	r, err := s.FindRoute(nicID, otherAddr, remoteIPv4Addr, ipv4.ProtocolNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatalf("s.FindRoute(%d, %s, %s, %d, false): %s", nicID, otherAddr, remoteIPv4Addr, ipv4.ProtocolNumber, err)
+	}
+	defer r.Release()
+
+	params := stack.NetworkHeaderParams{
+		Protocol: 111,
+		TTL:      64,
+		TOS:      stack.DefaultTOS,
+	}
+	data := buffer.View([]byte{1, 2, 3, 4})
+	if err := r.WritePacket(nil /* gso */, params, stack.NewPacketBuffer(stack.PacketBufferOptions{
+		ReserveHeaderBytes: int(r.MaxHeaderLength()),
+		Data:               data.ToVectorisedView(),
+	})); err != nil {
+		t.Fatalf("r.WritePacket(nil, %#v, _): %s", params, err)
+	}
+
+	// Removing the address should make the endpoint invalid.
+	if err := s.RemoveAddress(nicID, protoAddr.AddressWithPrefix.Address); err != nil {
+		t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, protoAddr.AddressWithPrefix.Address, err)
+	}
+	if err := r.WritePacket(nil /* gso */, params, stack.NewPacketBuffer(stack.PacketBufferOptions{
+		ReserveHeaderBytes: int(r.MaxHeaderLength()),
+		Data:               data.ToVectorisedView(),
+	})); err != tcpip.ErrInvalidEndpointState {
+		t.Fatalf("got r.WritePacket(nil, %#v, _) = %s, want = %s", params, err, tcpip.ErrInvalidEndpointState)
+	}
+}
diff --git a/pkg/tcpip/tests/integration/multicast_broadcast_test.go b/pkg/tcpip/tests/integration/multicast_broadcast_test.go
index 9f0dd4d6d..f1028823b 100644
--- a/pkg/tcpip/tests/integration/multicast_broadcast_test.go
+++ b/pkg/tcpip/tests/integration/multicast_broadcast_test.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -79,6 +80,7 @@ func TestPingMulticastBroadcast(t *testing.T) {
 			SrcAddr:     remoteIPv4Addr,
 			DstAddr:     dst,
 		})
+		ip.SetChecksum(^ip.CalculateChecksum())
 
 		e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
 			Data: hdr.View().ToVectorisedView(),
@@ -139,11 +141,9 @@ func TestPingMulticastBroadcast(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			ipv4Proto := ipv4.NewProtocol()
-			ipv6Proto := ipv6.NewProtocol()
 			s := stack.New(stack.Options{
-				NetworkProtocols:   []stack.NetworkProtocol{ipv4Proto, ipv6Proto},
-				TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol4(), icmp.NewProtocol6()},
+				NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol4, icmp.NewProtocol6},
 			})
 			// We only expect a single packet in response to our ICMP Echo Request.
 			e := channel.New(1, defaultMTU, "")
@@ -175,18 +175,18 @@ func TestPingMulticastBroadcast(t *testing.T) {
 			var rxICMP func(*channel.Endpoint, tcpip.Address)
 			var expectedSrc tcpip.Address
 			var expectedDst tcpip.Address
-			var proto stack.NetworkProtocol
+			var protoNum tcpip.NetworkProtocolNumber
 			switch l := len(test.dstAddr); l {
 			case header.IPv4AddressSize:
 				rxICMP = rxIPv4ICMP
 				expectedSrc = ipv4Addr.Address
 				expectedDst = remoteIPv4Addr
-				proto = ipv4Proto
+				protoNum = header.IPv4ProtocolNumber
 			case header.IPv6AddressSize:
 				rxICMP = rxIPv6ICMP
 				expectedSrc = ipv6Addr.Address
 				expectedDst = remoteIPv6Addr
-				proto = ipv6Proto
+				protoNum = header.IPv6ProtocolNumber
 			default:
 				t.Fatalf("got unexpected address length = %d bytes", l)
 			}
@@ -204,7 +204,7 @@ func TestPingMulticastBroadcast(t *testing.T) {
 				t.Errorf("got pkt.Route.RemoteAddress = %s, want = %s", pkt.Route.RemoteAddress, expectedDst)
 			}
 
-			src, dst := proto.ParseAddresses(pkt.Pkt.NetworkHeader().View())
+			src, dst := s.NetworkProtocolInstance(protoNum).ParseAddresses(stack.PayloadSince(pkt.Pkt.NetworkHeader()))
 			if src != expectedSrc {
 				t.Errorf("got pkt source = %s, want = %s", src, expectedSrc)
 			}
@@ -251,6 +251,7 @@ func TestIncomingMulticastAndBroadcast(t *testing.T) {
 			SrcAddr:     remoteIPv4Addr,
 			DstAddr:     dst,
 		})
+		ip.SetChecksum(^ip.CalculateChecksum())
 
 		e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
 			Data: hdr.View().ToVectorisedView(),
@@ -379,8 +380,8 @@ func TestIncomingMulticastAndBroadcast(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
-				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
 			})
 			e := channel.New(0, defaultMTU, "")
 			if err := s.CreateNIC(nicID, e); err != nil {
@@ -430,7 +431,126 @@ func TestIncomingMulticastAndBroadcast(t *testing.T) {
 				}
 			} else {
 				if err != tcpip.ErrWouldBlock {
-					t.Fatalf("got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+					t.Fatalf("got Read(nil) = (%x, _, %s), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+				}
+			}
+		})
+	}
+}
+
+// TestReuseAddrAndBroadcast makes sure broadcast packets are received by all
+// interested endpoints.
+func TestReuseAddrAndBroadcast(t *testing.T) {
+	const (
+		nicID             = 1
+		localPort         = 9000
+		loopbackBroadcast = tcpip.Address("\x7f\xff\xff\xff")
+	)
+
+	data := tcpip.SlicePayload([]byte{1, 2, 3, 4})
+
+	tests := []struct {
+		name          string
+		broadcastAddr tcpip.Address
+	}{
+		{
+			name:          "Subnet directed broadcast",
+			broadcastAddr: loopbackBroadcast,
+		},
+		{
+			name:          "IPv4 broadcast",
+			broadcastAddr: header.IPv4Broadcast,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
+			})
+			if err := s.CreateNIC(nicID, loopback.New()); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+			}
+			protoAddr := tcpip.ProtocolAddress{
+				Protocol: header.IPv4ProtocolNumber,
+				AddressWithPrefix: tcpip.AddressWithPrefix{
+					Address:   "\x7f\x00\x00\x01",
+					PrefixLen: 8,
+				},
+			}
+			if err := s.AddProtocolAddress(nicID, protoAddr); err != nil {
+				t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, protoAddr, err)
+			}
+
+			s.SetRouteTable([]tcpip.Route{
+				tcpip.Route{
+					// We use the empty subnet instead of just the loopback subnet so we
+					// also have a route to the IPv4 Broadcast address.
+					Destination: header.IPv4EmptySubnet,
+					NIC:         nicID,
+				},
+			})
+
+			// We create endpoints that bind to both the wildcard address and the
+			// broadcast address to make sure both of these types of "broadcast
+			// interested" endpoints receive broadcast packets.
+			wq := waiter.Queue{}
+			var eps []tcpip.Endpoint
+			for _, bindWildcard := range []bool{false, true} {
+				// Create multiple endpoints for each type of "broadcast interested"
+				// endpoint so we can test that all endpoints receive the broadcast
+				// packet.
+				for i := 0; i < 2; i++ {
+					ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
+					if err != nil {
+						t.Fatalf("(eps[%d]) NewEndpoint(%d, %d, _): %s", len(eps), udp.ProtocolNumber, ipv4.ProtocolNumber, err)
+					}
+					defer ep.Close()
+
+					if err := ep.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
+						t.Fatalf("eps[%d].SetSockOptBool(tcpip.ReuseAddressOption, true): %s", len(eps), err)
+					}
+
+					if err := ep.SetSockOptBool(tcpip.BroadcastOption, true); err != nil {
+						t.Fatalf("eps[%d].SetSockOptBool(tcpip.BroadcastOption, true): %s", len(eps), err)
+					}
+
+					bindAddr := tcpip.FullAddress{Port: localPort}
+					if bindWildcard {
+						if err := ep.Bind(bindAddr); err != nil {
+							t.Fatalf("eps[%d].Bind(%+v): %s", len(eps), bindAddr, err)
+						}
+					} else {
+						bindAddr.Addr = test.broadcastAddr
+						if err := ep.Bind(bindAddr); err != nil {
+							t.Fatalf("eps[%d].Bind(%+v): %s", len(eps), bindAddr, err)
+						}
+					}
+
+					eps = append(eps, ep)
+				}
+			}
+
+			for i, wep := range eps {
+				writeOpts := tcpip.WriteOptions{
+					To: &tcpip.FullAddress{
+						Addr: test.broadcastAddr,
+						Port: localPort,
+					},
+				}
+				if n, _, err := wep.Write(data, writeOpts); err != nil {
+					t.Fatalf("eps[%d].Write(_, _): %s", i, err)
+				} else if want := int64(len(data)); n != want {
+					t.Fatalf("got eps[%d].Write(_, _) = (%d, nil, nil), want = (%d, nil, nil)", i, n, want)
+				}
+
+				for j, rep := range eps {
+					if gotPayload, _, err := rep.Read(nil); err != nil {
+						t.Errorf("(eps[%d] write) eps[%d].Read(nil): %s", i, j, err)
+					} else if diff := cmp.Diff(buffer.View(data), gotPayload); diff != "" {
+						t.Errorf("(eps[%d] write) got UDP payload from eps[%d] mismatch (-want +got):\n%s", i, j, diff)
+					}
 				}
 			}
 		})
diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go
index f32d58091..606363567 100644
--- a/pkg/tcpip/time_unsafe.go
+++ b/pkg/tcpip/time_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.9
-// +build !go1.16
+// +build !go1.17
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index bd6f49eb8..a17234946 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -74,6 +74,8 @@ type endpoint struct {
 	route         stack.Route `state:"manual"`
 	ttl           uint8
 	stats         tcpip.TransportEndpointStats `state:"nosave"`
+	// linger is used for SO_LINGER socket option.
+	linger tcpip.LingerOption
 
 	// owner is used to get uid and gid of the packet.
 	owner tcpip.PacketOwner
@@ -343,10 +345,15 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 }
 
 // SetSockOpt sets a socket option.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.SocketDetachFilterOption:
+func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
+	switch v := opt.(type) {
+	case *tcpip.SocketDetachFilterOption:
 		return nil
+
+	case *tcpip.LingerOption:
+		e.mu.Lock()
+		e.linger = *v
+		e.mu.Unlock()
 	}
 	return nil
 }
@@ -371,7 +378,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
 func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	switch opt {
-	case tcpip.KeepaliveEnabledOption:
+	case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption:
 		return false, nil
 
 	default:
@@ -415,9 +422,12 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.ErrorOption:
+func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
+	switch o := opt.(type) {
+	case *tcpip.LingerOption:
+		e.mu.Lock()
+		*o = e.linger
+		e.mu.Unlock()
 		return nil
 
 	default:
@@ -436,6 +446,7 @@ func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpi
 	pkt.Owner = owner
 
 	icmpv4 := header.ICMPv4(pkt.TransportHeader().Push(header.ICMPv4MinimumSize))
+	pkt.TransportProtocolNumber = header.ICMPv4ProtocolNumber
 	copy(icmpv4, data)
 	// Set the ident to the user-specified port. Sequence number should
 	// already be set by the user.
@@ -468,6 +479,7 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	})
 
 	icmpv6 := header.ICMPv6(pkt.TransportHeader().Push(header.ICMPv6MinimumSize))
+	pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber
 	copy(icmpv6, data)
 	// Set the ident. Sequence number is provided by the user.
 	icmpv6.SetIdent(ident)
@@ -603,7 +615,7 @@ func (*endpoint) Listen(int) *tcpip.Error {
 }
 
 // Accept is not supported by UDP, it just fails.
-func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	return nil, nil, tcpip.ErrNotSupported
 }
 
@@ -836,3 +848,8 @@ func (e *endpoint) Stats() tcpip.EndpointStats {
 
 // Wait implements stack.TransportEndpoint.Wait.
 func (*endpoint) Wait() {}
+
+// LastError implements tcpip.Endpoint.LastError.
+func (*endpoint) LastError() *tcpip.Error {
+	return nil
+}
diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go
index 74ef6541e..87d510f96 100644
--- a/pkg/tcpip/transport/icmp/protocol.go
+++ b/pkg/tcpip/transport/icmp/protocol.go
@@ -13,12 +13,7 @@
 // limitations under the License.
 
 // Package icmp contains the implementation of the ICMP and IPv6-ICMP transport
-// protocols for use in ping. To use it in the networking stack, this package
-// must be added to the project, and activated on the stack by passing
-// icmp.NewProtocol4() and/or icmp.NewProtocol6() as one of the transport
-// protocols when calling stack.New(). Then endpoints can be created by passing
-// icmp.ProtocolNumber or icmp.ProtocolNumber6 as the transport protocol number
-// when calling Stack.NewEndpoint().
+// protocols for use in ping.
 package icmp
 
 import (
@@ -42,6 +37,8 @@ const (
 
 // protocol implements stack.TransportProtocol.
 type protocol struct {
+	stack *stack.Stack
+
 	number tcpip.TransportProtocolNumber
 }
 
@@ -62,20 +59,20 @@ func (p *protocol) netProto() tcpip.NetworkProtocolNumber {
 
 // NewEndpoint creates a new icmp endpoint. It implements
 // stack.TransportProtocol.NewEndpoint.
-func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	if netProto != p.netProto() {
 		return nil, tcpip.ErrUnknownProtocol
 	}
-	return newEndpoint(stack, netProto, p.number, waiterQueue)
+	return newEndpoint(p.stack, netProto, p.number, waiterQueue)
 }
 
 // NewRawEndpoint creates a new raw icmp endpoint. It implements
 // stack.TransportProtocol.NewRawEndpoint.
-func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	if netProto != p.netProto() {
 		return nil, tcpip.ErrUnknownProtocol
 	}
-	return raw.NewEndpoint(stack, netProto, p.number, waiterQueue)
+	return raw.NewEndpoint(p.stack, netProto, p.number, waiterQueue)
 }
 
 // MinimumPacketSize returns the minimum valid icmp packet size.
@@ -104,17 +101,17 @@ func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error)
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) bool {
-	return true
+func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition {
+	return stack.UnknownDestinationPacketHandled
 }
 
 // SetOption implements stack.TransportProtocol.SetOption.
-func (*protocol) SetOption(option interface{}) *tcpip.Error {
+func (*protocol) SetOption(tcpip.SettableTransportProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
 // Option implements stack.TransportProtocol.Option.
-func (*protocol) Option(option interface{}) *tcpip.Error {
+func (*protocol) Option(tcpip.GettableTransportProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
@@ -135,11 +132,11 @@ func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
 }
 
 // NewProtocol4 returns an ICMPv4 transport protocol.
-func NewProtocol4() stack.TransportProtocol {
-	return &protocol{ProtocolNumber4}
+func NewProtocol4(s *stack.Stack) stack.TransportProtocol {
+	return &protocol{stack: s, number: ProtocolNumber4}
 }
 
 // NewProtocol6 returns an ICMPv6 transport protocol.
-func NewProtocol6() stack.TransportProtocol {
-	return &protocol{ProtocolNumber6}
+func NewProtocol6(s *stack.Stack) stack.TransportProtocol {
+	return &protocol{stack: s, number: ProtocolNumber6}
 }
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 1b03ad6bb..31831a6d8 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -83,6 +83,8 @@ type endpoint struct {
 	stats         tcpip.TransportEndpointStats `state:"nosave"`
 	bound         bool
 	boundNIC      tcpip.NICID
+	// linger is used for SO_LINGER socket option.
+	linger tcpip.LingerOption
 
 	// lastErrorMu protects lastError.
 	lastErrorMu sync.Mutex   `state:"nosave"`
@@ -192,13 +194,13 @@ func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMes
 	return ep.ReadPacket(addr, nil)
 }
 
-func (ep *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+func (*endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
 	// TODO(gvisor.dev/issue/173): Implement.
 	return 0, nil, tcpip.ErrInvalidOptionValue
 }
 
 // Peek implements tcpip.Endpoint.Peek.
-func (ep *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
+func (*endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 	return 0, tcpip.ControlMessages{}, nil
 }
 
@@ -210,25 +212,25 @@ func (*endpoint) Disconnect() *tcpip.Error {
 
 // Connect implements tcpip.Endpoint.Connect. Packet sockets cannot be
 // connected, and this function always returnes tcpip.ErrNotSupported.
-func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+func (*endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // Shutdown implements tcpip.Endpoint.Shutdown. Packet sockets cannot be used
 // with Shutdown, and this function always returns tcpip.ErrNotSupported.
-func (ep *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
+func (*endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // Listen implements tcpip.Endpoint.Listen. Packet sockets cannot be used with
 // Listen, and this function always returns tcpip.ErrNotSupported.
-func (ep *endpoint) Listen(backlog int) *tcpip.Error {
+func (*endpoint) Listen(backlog int) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // Accept implements tcpip.Endpoint.Accept. Packet sockets cannot be used with
 // Accept, and this function always returns tcpip.ErrNotSupported.
-func (ep *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	return nil, nil, tcpip.ErrNotSupported
 }
 
@@ -267,12 +269,12 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
 }
 
 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
-func (ep *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+func (*endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
 	return tcpip.FullAddress{}, tcpip.ErrNotSupported
 }
 
 // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress.
-func (ep *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	// Even a connected socket doesn't return a remote address.
 	return tcpip.FullAddress{}, tcpip.ErrNotConnected
 }
@@ -297,9 +299,15 @@ func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 // SetSockOpt implements tcpip.Endpoint.SetSockOpt. Packet sockets cannot be
 // used with SetSockOpt, and this function always returns
 // tcpip.ErrNotSupported.
-func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.SocketDetachFilterOption:
+func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
+	switch v := opt.(type) {
+	case *tcpip.SocketDetachFilterOption:
+		return nil
+
+	case *tcpip.LingerOption:
+		ep.mu.Lock()
+		ep.linger = *v
+		ep.mu.Unlock()
 		return nil
 
 	default:
@@ -356,7 +364,7 @@ func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	}
 }
 
-func (ep *endpoint) takeLastError() *tcpip.Error {
+func (ep *endpoint) LastError() *tcpip.Error {
 	ep.lastErrorMu.Lock()
 	defer ep.lastErrorMu.Unlock()
 
@@ -366,17 +374,27 @@ func (ep *endpoint) takeLastError() *tcpip.Error {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.ErrorOption:
-		return ep.takeLastError()
+func (ep *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
+	switch o := opt.(type) {
+	case *tcpip.LingerOption:
+		ep.mu.Lock()
+		*o = ep.linger
+		ep.mu.Unlock()
+		return nil
+
+	default:
+		return tcpip.ErrNotSupported
 	}
-	return tcpip.ErrNotSupported
 }
 
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (ep *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	return false, tcpip.ErrNotSupported
+func (*endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	switch opt {
+	case tcpip.AcceptConnOption:
+		return false, nil
+	default:
+		return false, tcpip.ErrNotSupported
+	}
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
@@ -512,7 +530,7 @@ func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress,
 }
 
 // State implements socket.Socket.State.
-func (ep *endpoint) State() uint32 {
+func (*endpoint) State() uint32 {
 	return 0
 }
 
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index edc2b5b61..79f688129 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -84,6 +84,8 @@ type endpoint struct {
 	// Connect(), and is valid only when conneted is true.
 	route stack.Route                  `state:"manual"`
 	stats tcpip.TransportEndpointStats `state:"nosave"`
+	// linger is used for SO_LINGER socket option.
+	linger tcpip.LingerOption
 
 	// owner is used to get uid and gid of the packet.
 	owner tcpip.PacketOwner
@@ -446,12 +448,12 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 }
 
 // Listen implements tcpip.Endpoint.Listen.
-func (e *endpoint) Listen(backlog int) *tcpip.Error {
+func (*endpoint) Listen(backlog int) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
 // Accept implements tcpip.Endpoint.Accept.
-func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	return nil, nil, tcpip.ErrNotSupported
 }
 
@@ -482,12 +484,12 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
 }
 
 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
-func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+func (*endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
 	return tcpip.FullAddress{}, tcpip.ErrNotSupported
 }
 
 // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress.
-func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	// Even a connected socket doesn't return a remote address.
 	return tcpip.FullAddress{}, tcpip.ErrNotConnected
 }
@@ -510,9 +512,15 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 }
 
 // SetSockOpt implements tcpip.Endpoint.SetSockOpt.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.SocketDetachFilterOption:
+func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
+	switch v := opt.(type) {
+	case *tcpip.SocketDetachFilterOption:
+		return nil
+
+	case *tcpip.LingerOption:
+		e.mu.Lock()
+		e.linger = *v
+		e.mu.Unlock()
 		return nil
 
 	default:
@@ -577,9 +585,12 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch opt.(type) {
-	case tcpip.ErrorOption:
+func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
+	switch o := opt.(type) {
+	case *tcpip.LingerOption:
+		e.mu.Lock()
+		*o = e.linger
+		e.mu.Unlock()
 		return nil
 
 	default:
@@ -590,7 +601,7 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
 func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	switch opt {
-	case tcpip.KeepaliveEnabledOption:
+	case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption:
 		return false, nil
 
 	case tcpip.IPHdrIncludedOption:
@@ -739,3 +750,7 @@ func (e *endpoint) Stats() tcpip.EndpointStats {
 
 // Wait implements stack.TransportEndpoint.Wait.
 func (*endpoint) Wait() {}
+
+func (*endpoint) LastError() *tcpip.Error {
+	return nil
+}
diff --git a/pkg/tcpip/transport/raw/endpoint_state.go b/pkg/tcpip/transport/raw/endpoint_state.go
index 33bfb56cd..7d97cbdc7 100644
--- a/pkg/tcpip/transport/raw/endpoint_state.go
+++ b/pkg/tcpip/transport/raw/endpoint_state.go
@@ -37,57 +37,57 @@ func (p *rawPacket) loadData(data buffer.VectorisedView) {
 }
 
 // beforeSave is invoked by stateify.
-func (ep *endpoint) beforeSave() {
+func (e *endpoint) beforeSave() {
 	// Stop incoming packets from being handled (and mutate endpoint state).
 	// The lock will be released after saveRcvBufSizeMax(), which would have
-	// saved ep.rcvBufSizeMax and set it to 0 to continue blocking incoming
+	// saved e.rcvBufSizeMax and set it to 0 to continue blocking incoming
 	// packets.
-	ep.rcvMu.Lock()
+	e.rcvMu.Lock()
 }
 
 // saveRcvBufSizeMax is invoked by stateify.
-func (ep *endpoint) saveRcvBufSizeMax() int {
-	max := ep.rcvBufSizeMax
+func (e *endpoint) saveRcvBufSizeMax() int {
+	max := e.rcvBufSizeMax
 	// Make sure no new packets will be handled regardless of the lock.
-	ep.rcvBufSizeMax = 0
+	e.rcvBufSizeMax = 0
 	// Release the lock acquired in beforeSave() so regular endpoint closing
 	// logic can proceed after save.
-	ep.rcvMu.Unlock()
+	e.rcvMu.Unlock()
 	return max
 }
 
 // loadRcvBufSizeMax is invoked by stateify.
-func (ep *endpoint) loadRcvBufSizeMax(max int) {
-	ep.rcvBufSizeMax = max
+func (e *endpoint) loadRcvBufSizeMax(max int) {
+	e.rcvBufSizeMax = max
 }
 
 // afterLoad is invoked by stateify.
-func (ep *endpoint) afterLoad() {
-	stack.StackFromEnv.RegisterRestoredEndpoint(ep)
+func (e *endpoint) afterLoad() {
+	stack.StackFromEnv.RegisterRestoredEndpoint(e)
 }
 
 // Resume implements tcpip.ResumableEndpoint.Resume.
-func (ep *endpoint) Resume(s *stack.Stack) {
-	ep.stack = s
+func (e *endpoint) Resume(s *stack.Stack) {
+	e.stack = s
 
 	// If the endpoint is connected, re-connect.
-	if ep.connected {
+	if e.connected {
 		var err *tcpip.Error
-		ep.route, err = ep.stack.FindRoute(ep.RegisterNICID, ep.BindAddr, ep.route.RemoteAddress, ep.NetProto, false)
+		e.route, err = e.stack.FindRoute(e.RegisterNICID, e.BindAddr, e.route.RemoteAddress, e.NetProto, false)
 		if err != nil {
 			panic(err)
 		}
 	}
 
 	// If the endpoint is bound, re-bind.
-	if ep.bound {
-		if ep.stack.CheckLocalAddress(ep.RegisterNICID, ep.NetProto, ep.BindAddr) == 0 {
+	if e.bound {
+		if e.stack.CheckLocalAddress(e.RegisterNICID, e.NetProto, e.BindAddr) == 0 {
 			panic(tcpip.ErrBadLocalAddress)
 		}
 	}
 
-	if ep.associated {
-		if err := ep.stack.RegisterRawTransportEndpoint(ep.RegisterNICID, ep.NetProto, ep.TransProto, ep); err != nil {
+	if e.associated {
+		if err := e.stack.RegisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e); err != nil {
 			panic(err)
 		}
 	}
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index bde071f2a..518449602 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -11,8 +11,7 @@ go_template_instance(
     template = "//pkg/ilist:generic_list",
     types = {
         "Element": "*segment",
-        "ElementMapper": "segmentMapper",
-        "Linker": "*segmentEntry",
+        "Linker": "*segment",
     },
 )
 
@@ -28,19 +27,6 @@ go_template_instance(
     },
 )
 
-go_template_instance(
-    name = "tcp_rack_segment_list",
-    out = "tcp_rack_segment_list.go",
-    package = "tcp",
-    prefix = "rackSegment",
-    template = "//pkg/ilist:generic_list",
-    types = {
-        "Element": "*segment",
-        "ElementMapper": "rackSegmentMapper",
-        "Linker": "*rackSegmentEntry",
-    },
-)
-
 go_library(
     name = "tcp",
     srcs = [
@@ -69,7 +55,6 @@ go_library(
         "snd.go",
         "snd_state.go",
         "tcp_endpoint_list.go",
-        "tcp_rack_segment_list.go",
         "tcp_segment_list.go",
         "timer.go",
     ],
@@ -84,6 +69,7 @@ go_library(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/hash/jenkins",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/header/parse",
         "//pkg/tcpip/ports",
         "//pkg/tcpip/seqnum",
         "//pkg/tcpip/stack",
@@ -108,6 +94,7 @@ go_test(
     shard_count = 10,
     deps = [
         ":tcp",
+        "//pkg/rand",
         "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index b706438bd..6b3238d6b 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -425,20 +425,17 @@ func (e *endpoint) notifyAborted() {
 // cookies to accept connections.
 func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header.TCPSynOptions) {
 	defer ctx.synRcvdCount.dec()
-	defer func() {
-		e.mu.Lock()
-		e.decSynRcvdCount()
-		e.mu.Unlock()
-	}()
 	defer s.decRef()
 
 	n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{}, e.owner)
 	if err != nil {
 		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 		e.stats.FailedConnectionAttempts.Increment()
+		e.decSynRcvdCount()
 		return
 	}
 	ctx.removePendingEndpoint(n)
+	e.decSynRcvdCount()
 	n.startAcceptedLoop()
 	e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 
@@ -456,7 +453,9 @@ func (e *endpoint) incSynRcvdCount() bool {
 }
 
 func (e *endpoint) decSynRcvdCount() {
+	e.mu.Lock()
 	e.synRcvdCount--
+	e.mu.Unlock()
 }
 
 func (e *endpoint) acceptQueueIsFull() bool {
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 87980c0a1..0aaef495d 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -491,7 +491,7 @@ func (h *handshake) resolveRoute() *tcpip.Error {
 				h.ep.mu.Lock()
 			}
 			if n&notifyError != 0 {
-				return h.ep.takeLastError()
+				return h.ep.LastError()
 			}
 		}
 
@@ -522,7 +522,7 @@ func (h *handshake) execute() *tcpip.Error {
 	s.AddWaker(&h.ep.newSegmentWaker, wakerForNewSegment)
 	defer s.Done()
 
-	var sackEnabled SACKEnabled
+	var sackEnabled tcpip.TCPSACKEnabled
 	if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil {
 		// If stack returned an error when checking for SACKEnabled
 		// status then just default to switching off SACK negotiation.
@@ -620,7 +620,7 @@ func (h *handshake) execute() *tcpip.Error {
 				h.ep.mu.Lock()
 			}
 			if n&notifyError != 0 {
-				return h.ep.takeLastError()
+				return h.ep.LastError()
 			}
 
 		case wakerForNewSegment:
@@ -747,6 +747,7 @@ func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedV
 func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *stack.GSO) {
 	optLen := len(tf.opts)
 	tcp := header.TCP(pkt.TransportHeader().Push(header.TCPMinimumSize + optLen))
+	pkt.TransportProtocolNumber = header.TCPProtocolNumber
 	tcp.Encode(&header.TCPFields{
 		SrcPort:    tf.id.LocalPort,
 		DstPort:    tf.id.RemotePort,
@@ -803,7 +804,7 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso
 		pkt.Owner = owner
 		pkt.EgressRoute = r
 		pkt.GSOOptions = gso
-		pkt.NetworkProtocolNumber = r.NetworkProtocolNumber()
+		pkt.NetworkProtocolNumber = r.NetProto
 		data.ReadToVV(&pkt.Data, packetSize)
 		buildTCPHdr(r, tf, pkt, gso)
 		tf.seq = tf.seq.Add(seqnum.Size(packetSize))
@@ -897,7 +898,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
 // sendRaw sends a TCP segment to the endpoint's peer.
 func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error {
 	var sackBlocks []header.SACKBlock
-	if e.EndpointState() == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
+	if e.EndpointState() == StateEstablished && e.rcv.pendingRcvdSegments.Len() > 0 && (flags&header.TCPFlagAck != 0) {
 		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
 	}
 	options := e.makeOptions(sackBlocks)
@@ -924,18 +925,7 @@ func (e *endpoint) handleWrite() *tcpip.Error {
 
 	first := e.sndQueue.Front()
 	if first != nil {
-		lastSeg := e.snd.writeList.Back()
 		e.snd.writeList.PushBackList(&e.sndQueue)
-		if lastSeg == nil {
-			lastSeg = e.snd.writeList.Front()
-		} else {
-			lastSeg = lastSeg.segEntry.Next()
-		}
-		// Add new segments to rcList, as rcList and writeList should
-		// be consistent.
-		for seg := lastSeg; seg != nil; seg = seg.segEntry.Next() {
-			e.snd.rcList.PushBack(seg)
-		}
 		e.sndBufInQueue = 0
 	}
 
@@ -1013,9 +1003,8 @@ func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
 	// (indicated by a negative send window scale).
 	e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)
 
-	rcvBufSize := seqnum.Size(e.receiveBufferSize())
 	e.rcvListMu.Lock()
-	e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize)
+	e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale())
 	// Bootstrap the auto tuning algorithm. Starting at zero will
 	// result in a really large receive window after the first auto
 	// tuning adjustment.
@@ -1146,12 +1135,11 @@ func (e *endpoint) handleSegments(fastPath bool) *tcpip.Error {
 		}
 
 		cont, err := e.handleSegment(s)
+		s.decRef()
 		if err != nil {
-			s.decRef()
 			return err
 		}
 		if !cont {
-			s.decRef()
 			return nil
 		}
 	}
@@ -1243,7 +1231,6 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
 			// or a notification from the protocolMainLoop (caller goroutine).
 			// This means that with this return, the segment dequeue below can
 			// never occur on a closed endpoint.
-			s.decRef()
 			return false, nil
 		}
 
@@ -1435,10 +1422,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 					e.rcv.nonZeroWindow()
 				}
 
-				if n&notifyReceiveWindowChanged != 0 {
-					e.rcv.pendingBufSize = seqnum.Size(e.receiveBufferSize())
-				}
-
 				if n&notifyMTUChanged != 0 {
 					e.sndBufMu.Lock()
 					count := e.packetTooBigCount
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index 804e95aea..560b4904c 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -78,16 +78,15 @@ func testV4Connect(t *testing.T, c *context.Context, checkers ...checker.Network
 	ackCheckers := append(checkers, checker.TCP(
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagAck),
-		checker.SeqNum(uint32(c.IRS)+1),
-		checker.AckNum(uint32(iss)+1),
+		checker.TCPSeqNum(uint32(c.IRS)+1),
+		checker.TCPAckNum(uint32(iss)+1),
 	))
 	checker.IPv4(t, c.GetPacket(), ackCheckers...)
 
 	// Wait for connection to be established.
 	select {
 	case <-ch:
-		err = c.EP.GetSockOpt(tcpip.ErrorOption{})
-		if err != nil {
+		if err := c.EP.LastError(); err != nil {
 			t.Fatalf("Unexpected error when connecting: %v", err)
 		}
 	case <-time.After(1 * time.Second):
@@ -186,16 +185,15 @@ func testV6Connect(t *testing.T, c *context.Context, checkers ...checker.Network
 	ackCheckers := append(checkers, checker.TCP(
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagAck),
-		checker.SeqNum(uint32(c.IRS)+1),
-		checker.AckNum(uint32(iss)+1),
+		checker.TCPSeqNum(uint32(c.IRS)+1),
+		checker.TCPAckNum(uint32(iss)+1),
 	))
 	checker.IPv6(t, c.GetV6Packet(), ackCheckers...)
 
 	// Wait for connection to be established.
 	select {
 	case <-ch:
-		err = c.EP.GetSockOpt(tcpip.ErrorOption{})
-		if err != nil {
+		if err := c.EP.LastError(); err != nil {
 			t.Fatalf("Unexpected error when connecting: %v", err)
 		}
 	case <-time.After(1 * time.Second):
@@ -285,7 +283,7 @@ func TestV4RefuseOnV6Only(t *testing.T) {
 			checker.SrcPort(context.StackPort),
 			checker.DstPort(context.TestPort),
 			checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
-			checker.AckNum(uint32(irs)+1),
+			checker.TCPAckNum(uint32(irs)+1),
 		),
 	)
 }
@@ -321,7 +319,7 @@ func TestV6RefuseOnBoundToV4Mapped(t *testing.T) {
 			checker.SrcPort(context.StackPort),
 			checker.DstPort(context.TestPort),
 			checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
-			checker.AckNum(uint32(irs)+1),
+			checker.TCPAckNum(uint32(irs)+1),
 		),
 	)
 }
@@ -354,7 +352,7 @@ func testV4Accept(t *testing.T, c *context.Context) {
 			checker.SrcPort(context.StackPort),
 			checker.DstPort(context.TestPort),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
-			checker.AckNum(uint32(irs)+1),
+			checker.TCPAckNum(uint32(irs)+1),
 		),
 	)
 
@@ -373,12 +371,12 @@ func testV4Accept(t *testing.T, c *context.Context) {
 	c.WQ.EventRegister(&we, waiter.EventIn)
 	defer c.WQ.EventUnregister(&we)
 
-	nep, _, err := c.EP.Accept()
+	nep, _, err := c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			nep, _, err = c.EP.Accept()
+			nep, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %v", err)
 			}
@@ -494,7 +492,7 @@ func TestV6AcceptOnV6(t *testing.T) {
 			checker.SrcPort(context.StackPort),
 			checker.DstPort(context.TestPort),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
-			checker.AckNum(uint32(irs)+1),
+			checker.TCPAckNum(uint32(irs)+1),
 		),
 	)
 
@@ -512,13 +510,13 @@ func TestV6AcceptOnV6(t *testing.T) {
 	we, ch := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&we, waiter.EventIn)
 	defer c.WQ.EventUnregister(&we)
-
-	nep, _, err := c.EP.Accept()
+	var addr tcpip.FullAddress
+	nep, _, err := c.EP.Accept(&addr)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			nep, _, err = c.EP.Accept()
+			nep, _, err = c.EP.Accept(&addr)
 			if err != nil {
 				t.Fatalf("Accept failed: %v", err)
 			}
@@ -528,20 +526,14 @@ func TestV6AcceptOnV6(t *testing.T) {
 		}
 	}
 
+	if addr.Addr != context.TestV6Addr {
+		t.Errorf("Unexpected remote address: got %s, want %s", addr.Addr, context.TestV6Addr)
+	}
+
 	// Make sure we can still query the v6 only status of the new endpoint,
 	// that is, that it is in fact a v6 socket.
 	if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != nil {
-		t.Fatalf("GetSockOpt failed failed: %v", err)
-	}
-
-	// Check the peer address.
-	addr, err := nep.GetRemoteAddress()
-	if err != nil {
-		t.Fatalf("GetRemoteAddress failed failed: %v", err)
-	}
-
-	if addr.Addr != context.TestV6Addr {
-		t.Fatalf("Unexpected remote address: got %v, want %v", addr.Addr, context.TestV6Addr)
+		t.Errorf("GetSockOptBool(tcpip.V6OnlyOption) failed: %s", err)
 	}
 }
 
@@ -568,8 +560,9 @@ func TestV4AcceptOnV4(t *testing.T) {
 func testV4ListenClose(t *testing.T, c *context.Context) {
 	// Set the SynRcvd threshold to zero to force a syn cookie based accept
 	// to happen.
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
-		t.Fatalf("setting TCPSynRcvdCountThresholdOption failed: %s", err)
+	var opt tcpip.TCPSynRcvdCountThresholdOption
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("setting TCPSynRcvdCountThresholdOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	const n = uint16(32)
@@ -612,12 +605,12 @@ func testV4ListenClose(t *testing.T, c *context.Context) {
 	we, ch := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&we, waiter.EventIn)
 	defer c.WQ.EventUnregister(&we)
-	nep, _, err := c.EP.Accept()
+	nep, _, err := c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			nep, _, err = c.EP.Accept()
+			nep, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %v", err)
 			}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 21a4b6e2f..c826942e9 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -63,6 +63,17 @@ const (
 	StateClosing
 )
 
+const (
+	// rcvAdvWndScale is used to split the available socket buffer into
+	// application buffer and the window to be advertised to the peer. This is
+	// currently hard coded to split the available space equally.
+	rcvAdvWndScale = 1
+
+	// SegOverheadFactor is used to multiply the value provided by the
+	// user on a SetSockOpt for setting the socket send/receive buffer sizes.
+	SegOverheadFactor = 2
+)
+
 // connected returns true when s is one of the states representing an
 // endpoint connected to a peer.
 func (s EndpointState) connected() bool {
@@ -149,7 +160,6 @@ func (s EndpointState) String() string {
 // Reasons for notifying the protocol goroutine.
 const (
 	notifyNonZeroReceiveWindow = 1 << iota
-	notifyReceiveWindowChanged
 	notifyClose
 	notifyMTUChanged
 	notifyDrain
@@ -238,6 +248,11 @@ type ReceiveErrors struct {
 	// ZeroRcvWindowState is the number of times we advertised
 	// a zero receive window when rcvList is full.
 	ZeroRcvWindowState tcpip.StatCounter
+
+	// WantZeroWindow is the number of times we wanted to advertise a
+	// zero receive window but couldn't because it would have caused
+	// the receive window's right edge to shrink.
+	WantZeroRcvWindow tcpip.StatCounter
 }
 
 // SendErrors collect segment send errors within the transport layer.
@@ -384,13 +399,26 @@ type endpoint struct {
 	// to indicate to users that no more data is coming.
 	//
 	// rcvListMu can be taken after the endpoint mu below.
-	rcvListMu     sync.Mutex  `state:"nosave"`
-	rcvList       segmentList `state:"wait"`
-	rcvClosed     bool
-	rcvBufSize    int
+	rcvListMu sync.Mutex  `state:"nosave"`
+	rcvList   segmentList `state:"wait"`
+	rcvClosed bool
+	// rcvBufSize is the total size of the receive buffer.
+	rcvBufSize int
+	// rcvBufUsed is the actual number of payload bytes held in the receive buffer
+	// not counting any overheads of the segments itself. NOTE: This will always
+	// be strictly <= rcvMemUsed below.
 	rcvBufUsed    int
 	rcvAutoParams rcvBufAutoTuneParams
 
+	// rcvMemUsed tracks the total amount of memory in use by received segments
+	// held in rcvList, pendingRcvdSegments and the segment queue. This is used to
+	// compute the window and the actual available buffer space. This is distinct
+	// from rcvBufUsed above which is the actual number of payload bytes held in
+	// the buffer not including any segment overheads.
+	//
+	// rcvMemUsed must be accessed atomically.
+	rcvMemUsed int32
+
 	// mu protects all endpoint fields unless documented otherwise. mu must
 	// be acquired before interacting with the endpoint fields.
 	mu          sync.Mutex `state:"nosave"`
@@ -654,6 +682,9 @@ type endpoint struct {
 
 	// owner is used to get uid and gid of the packet.
 	owner tcpip.PacketOwner
+
+	// linger is used for SO_LINGER socket option.
+	linger tcpip.LingerOption
 }
 
 // UniqueID implements stack.TransportEndpoint.UniqueID.
@@ -849,12 +880,12 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		maxSynRetries: DefaultSynRetries,
 	}
 
-	var ss SendBufferSizeOption
+	var ss tcpip.TCPSendBufferSizeRangeOption
 	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
 		e.sndBufSize = ss.Default
 	}
 
-	var rs ReceiveBufferSizeOption
+	var rs tcpip.TCPReceiveBufferSizeRangeOption
 	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
 		e.rcvBufSize = rs.Default
 	}
@@ -864,12 +895,12 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		e.cc = cs
 	}
 
-	var mrb tcpip.ModerateReceiveBufferOption
+	var mrb tcpip.TCPModerateReceiveBufferOption
 	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
 		e.rcvAutoParams.disabled = !bool(mrb)
 	}
 
-	var de DelayEnabled
+	var de tcpip.TCPDelayEnabled
 	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
 		e.SetSockOptBool(tcpip.DelayOption, true)
 	}
@@ -888,7 +919,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		e.probe = p
 	}
 
-	e.segmentQueue.setLimit(MaxUnprocessedSegments)
+	e.segmentQueue.ep = e
 	e.tsOffset = timeStampOffset()
 	e.acceptCond = sync.NewCond(&e.acceptMu)
 
@@ -901,7 +932,12 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 	result := waiter.EventMask(0)
 
 	switch e.EndpointState() {
-	case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
+	case StateInitial, StateBound:
+		// This prevents blocking of new sockets which are not
+		// connected when SO_LINGER is set.
+		result |= waiter.EventHUp
+
+	case StateConnecting, StateSynSent, StateSynRecv:
 		// Ready for nothing.
 
 	case StateClose, StateError, StateTimeWait:
@@ -1007,6 +1043,26 @@ func (e *endpoint) Close() {
 		return
 	}
 
+	if e.linger.Enabled && e.linger.Timeout == 0 {
+		s := e.EndpointState()
+		isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv
+		if isResetState {
+			// Close the endpoint without doing full shutdown and
+			// send a RST.
+			e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+			e.closeNoShutdownLocked()
+
+			// Wake up worker to close the endpoint.
+			switch s {
+			case StateSynRecv:
+				e.notifyProtocolGoroutine(notifyClose)
+			default:
+				e.notifyProtocolGoroutine(notifyTickleWorker)
+			}
+			return
+		}
+	}
+
 	// Issue a shutdown so that the peer knows we won't send any more data
 	// if we're connected, or stop accepting if we're listening.
 	e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
@@ -1052,6 +1108,8 @@ func (e *endpoint) closeNoShutdownLocked() {
 		e.notifyProtocolGoroutine(notifyClose)
 	} else {
 		e.transitionToStateCloseLocked()
+		// Notify that the endpoint is closed.
+		e.waiterQueue.Notify(waiter.EventHUp)
 	}
 }
 
@@ -1106,10 +1164,16 @@ func (e *endpoint) cleanupLocked() {
 	tcpip.DeleteDanglingEndpoint(e)
 }
 
+// wndFromSpace returns the window that we can advertise based on the available
+// receive buffer space.
+func wndFromSpace(space int) int {
+	return space >> rcvAdvWndScale
+}
+
 // initialReceiveWindow returns the initial receive window to advertise in the
 // SYN/SYN-ACK.
 func (e *endpoint) initialReceiveWindow() int {
-	rcvWnd := e.receiveBufferAvailable()
+	rcvWnd := wndFromSpace(e.receiveBufferAvailable())
 	if rcvWnd > math.MaxUint16 {
 		rcvWnd = math.MaxUint16
 	}
@@ -1186,14 +1250,12 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 		// reject valid data that might already be in flight as the
 		// acceptable window will shrink.
 		if rcvWnd > e.rcvBufSize {
-			availBefore := e.receiveBufferAvailableLocked()
+			availBefore := wndFromSpace(e.receiveBufferAvailableLocked())
 			e.rcvBufSize = rcvWnd
-			availAfter := e.receiveBufferAvailableLocked()
-			mask := uint32(notifyReceiveWindowChanged)
+			availAfter := wndFromSpace(e.receiveBufferAvailableLocked())
 			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
-				mask |= notifyNonZeroReceiveWindow
+				e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
 			}
-			e.notifyProtocolGoroutine(mask)
 		}
 
 		// We only update prevCopied when we grow the buffer because in cases
@@ -1211,7 +1273,7 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
 	e.owner = owner
 }
 
-func (e *endpoint) takeLastError() *tcpip.Error {
+func (e *endpoint) LastError() *tcpip.Error {
 	e.lastErrorMu.Lock()
 	defer e.lastErrorMu.Unlock()
 	err := e.lastError
@@ -1270,18 +1332,22 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 	v := views[s.viewToDeliver]
 	s.viewToDeliver++
 
+	var delta int
 	if s.viewToDeliver >= len(views) {
 		e.rcvList.Remove(s)
+		// We only free up receive buffer space when the segment is released as the
+		// segment is still holding on to the views even though some views have been
+		// read out to the user.
+		delta = s.segMemSize()
 		s.decRef()
 	}
 
 	e.rcvBufUsed -= len(v)
-
 	// If the window was small before this read and if the read freed up
 	// enough buffer space, to either fit an aMSS or half a receive buffer
 	// (whichever smaller), then notify the protocol goroutine to send a
 	// window update.
-	if crossed, above := e.windowCrossedACKThresholdLocked(len(v)); crossed && above {
+	if crossed, above := e.windowCrossedACKThresholdLocked(delta); crossed && above {
 		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
 	}
 
@@ -1294,14 +1360,17 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 // indicating the reason why it's not writable.
 // Caller must hold e.mu and e.sndBufMu
 func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
-	// The endpoint cannot be written to if it's not connected.
-	if !e.EndpointState().connected() {
-		switch e.EndpointState() {
-		case StateError:
-			return 0, e.HardError
-		default:
-			return 0, tcpip.ErrClosedForSend
-		}
+	switch s := e.EndpointState(); {
+	case s == StateError:
+		return 0, e.HardError
+	case !s.connecting() && !s.connected():
+		return 0, tcpip.ErrClosedForSend
+	case s.connecting():
+		// As per RFC793, page 56, a send request arriving when in connecting
+		// state, can be queued to be completed after the state becomes
+		// connected. Return an error code for the caller of endpoint Write to
+		// try again, until the connection handshake is complete.
+		return 0, tcpip.ErrWouldBlock
 	}
 
 	// Check if the connection has already been closed for sends.
@@ -1428,7 +1497,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	vec = append([][]byte(nil), vec...)
 
 	var num int64
-	for s := e.rcvList.Front(); s != nil; s = s.segEntry.Next() {
+	for s := e.rcvList.Front(); s != nil; s = s.Next() {
 		views := s.data.Views()
 
 		for i := s.viewToDeliver; i < len(views); i++ {
@@ -1454,12 +1523,44 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	return num, tcpip.ControlMessages{}, nil
 }
 
+// selectWindowLocked returns the new window without checking for shrinking or scaling
+// applied.
+// Precondition: e.mu and e.rcvListMu must be held.
+func (e *endpoint) selectWindowLocked() (wnd seqnum.Size) {
+	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked())
+	maxWindow := wndFromSpace(e.rcvBufSize)
+	wndFromUsedBytes := maxWindow - e.rcvBufUsed
+
+	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
+	// cases where we receive a lot of small segments the segment overhead is a
+	// lot higher and we can run out socket buffer space before we can fill the
+	// previous window we advertised. In cases where we receive MSS sized or close
+	// MSS sized segments we will probably run out of window space before we
+	// exhaust receive buffer.
+	newWnd := wndFromAvailable
+	if newWnd > wndFromUsedBytes {
+		newWnd = wndFromUsedBytes
+	}
+	if newWnd < 0 {
+		newWnd = 0
+	}
+	return seqnum.Size(newWnd)
+}
+
+// selectWindow invokes selectWindowLocked after acquiring e.rcvListMu.
+func (e *endpoint) selectWindow() (wnd seqnum.Size) {
+	e.rcvListMu.Lock()
+	wnd = e.selectWindowLocked()
+	e.rcvListMu.Unlock()
+	return wnd
+}
+
 // windowCrossedACKThresholdLocked checks if the receive window to be announced
-// now would be under aMSS or under half receive buffer, whichever smaller. This
-// is useful as a receive side silly window syndrome prevention mechanism. If
-// window grows to reasonable value, we should send ACK to the sender to inform
-// the rx space is now large. We also want ensure a series of small read()'s
-// won't trigger a flood of spurious tiny ACK's.
+// would be under aMSS or under the window derived from half receive buffer,
+// whichever smaller. This is useful as a receive side silly window syndrome
+// prevention mechanism. If window grows to reasonable value, we should send ACK
+// to the sender to inform the rx space is now large. We also want ensure a
+// series of small read()'s won't trigger a flood of spurious tiny ACK's.
 //
 // For large receive buffers, the threshold is aMSS - once reader reads more
 // than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
@@ -1470,17 +1571,18 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 //
 // Precondition: e.mu and e.rcvListMu must be held.
 func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed bool, above bool) {
-	newAvail := e.receiveBufferAvailableLocked()
+	newAvail := int(e.selectWindowLocked())
 	oldAvail := newAvail - deltaBefore
 	if oldAvail < 0 {
 		oldAvail = 0
 	}
-
 	threshold := int(e.amss)
-	if threshold > e.rcvBufSize/2 {
-		threshold = e.rcvBufSize / 2
+	// rcvBufFraction is the inverse of the fraction of receive buffer size that
+	// is used to decide if the available buffer space is now above it.
+	const rcvBufFraction = 2
+	if wndThreshold := wndFromSpace(e.rcvBufSize / rcvBufFraction); threshold > wndThreshold {
+		threshold = wndThreshold
 	}
-
 	switch {
 	case oldAvail < threshold && newAvail >= threshold:
 		return true, true
@@ -1609,18 +1711,24 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	case tcpip.ReceiveBufferSizeOption:
 		// Make sure the receive buffer size is within the min and max
 		// allowed.
-		var rs ReceiveBufferSizeOption
-		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
+		var rs tcpip.TCPReceiveBufferSizeRangeOption
+		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
+			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &rs, err))
+		}
+
+		if v > rs.Max {
+			v = rs.Max
+		}
+
+		if v < math.MaxInt32/SegOverheadFactor {
+			v *= SegOverheadFactor
 			if v < rs.Min {
 				v = rs.Min
 			}
-			if v > rs.Max {
-				v = rs.Max
-			}
+		} else {
+			v = math.MaxInt32
 		}
 
-		mask := uint32(notifyReceiveWindowChanged)
-
 		e.LockUser()
 		e.rcvListMu.Lock()
 
@@ -1634,14 +1742,9 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 			v = 1 << scale
 		}
 
-		// Make sure 2*size doesn't overflow.
-		if v > math.MaxInt32/2 {
-			v = math.MaxInt32 / 2
-		}
-
-		availBefore := e.receiveBufferAvailableLocked()
+		availBefore := wndFromSpace(e.receiveBufferAvailableLocked())
 		e.rcvBufSize = v
-		availAfter := e.receiveBufferAvailableLocked()
+		availAfter := wndFromSpace(e.receiveBufferAvailableLocked())
 
 		e.rcvAutoParams.disabled = true
 
@@ -1649,24 +1752,31 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		// syndrome prevetion, when our available space grows above aMSS
 		// or half receive buffer, whichever smaller.
 		if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
-			mask |= notifyNonZeroReceiveWindow
+			e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
 		}
 
 		e.rcvListMu.Unlock()
 		e.UnlockUser()
-		e.notifyProtocolGoroutine(mask)
 
 	case tcpip.SendBufferSizeOption:
 		// Make sure the send buffer size is within the min and max
 		// allowed.
-		var ss SendBufferSizeOption
-		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
+		var ss tcpip.TCPSendBufferSizeRangeOption
+		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err != nil {
+			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &ss, err))
+		}
+
+		if v > ss.Max {
+			v = ss.Max
+		}
+
+		if v < math.MaxInt32/SegOverheadFactor {
+			v *= SegOverheadFactor
 			if v < ss.Min {
 				v = ss.Min
 			}
-			if v > ss.Max {
-				v = ss.Max
-			}
+		} else {
+			v = math.MaxInt32
 		}
 
 		e.sndBufMu.Lock()
@@ -1699,7 +1809,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 				return tcpip.ErrInvalidOptionValue
 			}
 		}
-		var rs ReceiveBufferSizeOption
+		var rs tcpip.TCPReceiveBufferSizeRangeOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
 			if v < rs.Min/2 {
 				v = rs.Min / 2
@@ -1713,10 +1823,10 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 }
 
 // SetSockOpt sets a socket option.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	switch v := opt.(type) {
-	case tcpip.BindToDeviceOption:
-		id := tcpip.NICID(v)
+	case *tcpip.BindToDeviceOption:
+		id := tcpip.NICID(*v)
 		if id != 0 && !e.stack.HasNIC(id) {
 			return tcpip.ErrUnknownDevice
 		}
@@ -1724,40 +1834,40 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.bindToDevice = id
 		e.UnlockUser()
 
-	case tcpip.KeepaliveIdleOption:
+	case *tcpip.KeepaliveIdleOption:
 		e.keepalive.Lock()
-		e.keepalive.idle = time.Duration(v)
+		e.keepalive.idle = time.Duration(*v)
 		e.keepalive.Unlock()
 		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
 
-	case tcpip.KeepaliveIntervalOption:
+	case *tcpip.KeepaliveIntervalOption:
 		e.keepalive.Lock()
-		e.keepalive.interval = time.Duration(v)
+		e.keepalive.interval = time.Duration(*v)
 		e.keepalive.Unlock()
 		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
 
-	case tcpip.OutOfBandInlineOption:
+	case *tcpip.OutOfBandInlineOption:
 		// We don't currently support disabling this option.
 
-	case tcpip.TCPUserTimeoutOption:
+	case *tcpip.TCPUserTimeoutOption:
 		e.LockUser()
-		e.userTimeout = time.Duration(v)
+		e.userTimeout = time.Duration(*v)
 		e.UnlockUser()
 
-	case tcpip.CongestionControlOption:
+	case *tcpip.CongestionControlOption:
 		// Query the available cc algorithms in the stack and
 		// validate that the specified algorithm is actually
 		// supported in the stack.
-		var avail tcpip.AvailableCongestionControlOption
+		var avail tcpip.TCPAvailableCongestionControlOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
 			return err
 		}
 		availCC := strings.Split(string(avail), " ")
 		for _, cc := range availCC {
-			if v == tcpip.CongestionControlOption(cc) {
+			if *v == tcpip.CongestionControlOption(cc) {
 				e.LockUser()
 				state := e.EndpointState()
-				e.cc = v
+				e.cc = *v
 				switch state {
 				case StateEstablished:
 					if e.EndpointState() == state {
@@ -1773,31 +1883,45 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		// control algorithm is specified.
 		return tcpip.ErrNoSuchFile
 
-	case tcpip.TCPLingerTimeoutOption:
+	case *tcpip.TCPLingerTimeoutOption:
 		e.LockUser()
-		if v < 0 {
+
+		switch {
+		case *v < 0:
 			// Same as effectively disabling TCPLinger timeout.
-			v = 0
-		}
-		// Cap it to MaxTCPLingerTimeout.
-		stkTCPLingerTimeout := tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
-		if v > stkTCPLingerTimeout {
-			v = stkTCPLingerTimeout
+			*v = -1
+		case *v == 0:
+			// Same as the stack default.
+			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
+			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
+				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
+			}
+			*v = stackLingerTimeout
+		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
+			// Cap it to Stack's default TCP_LINGER2 timeout.
+			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
+		default:
 		}
-		e.tcpLingerTimeout = time.Duration(v)
+
+		e.tcpLingerTimeout = time.Duration(*v)
 		e.UnlockUser()
 
-	case tcpip.TCPDeferAcceptOption:
+	case *tcpip.TCPDeferAcceptOption:
 		e.LockUser()
-		if time.Duration(v) > MaxRTO {
-			v = tcpip.TCPDeferAcceptOption(MaxRTO)
+		if time.Duration(*v) > MaxRTO {
+			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
 		}
-		e.deferAccept = time.Duration(v)
+		e.deferAccept = time.Duration(*v)
 		e.UnlockUser()
 
-	case tcpip.SocketDetachFilterOption:
+	case *tcpip.SocketDetachFilterOption:
 		return nil
 
+	case *tcpip.LingerOption:
+		e.LockUser()
+		e.linger = *v
+		e.UnlockUser()
+
 	default:
 		return nil
 	}
@@ -1875,6 +1999,12 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	case tcpip.MulticastLoopOption:
 		return true, nil
 
+	case tcpip.AcceptConnOption:
+		e.LockUser()
+		defer e.UnlockUser()
+
+		return e.EndpointState() == StateListen, nil
+
 	default:
 		return false, tcpip.ErrUnknownProtocolOption
 	}
@@ -1956,11 +2086,8 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 	switch o := opt.(type) {
-	case tcpip.ErrorOption:
-		return e.takeLastError()
-
 	case *tcpip.BindToDeviceOption:
 		e.LockUser()
 		*o = tcpip.BindToDeviceOption(e.bindToDevice)
@@ -2013,8 +2140,10 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.UnlockUser()
 
 	case *tcpip.OriginalDestinationOption:
+		e.LockUser()
 		ipt := e.stack.IPTables()
-		addr, port, err := ipt.OriginalDst(e.ID)
+		addr, port, err := ipt.OriginalDst(e.ID, e.NetProto)
+		e.UnlockUser()
 		if err != nil {
 			return err
 		}
@@ -2023,6 +2152,11 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 			Port: port,
 		}
 
+	case *tcpip.LingerOption:
+		e.LockUser()
+		*o = e.linger
+		e.UnlockUser()
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -2169,7 +2303,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 			if sameAddr && p == e.ID.RemotePort {
 				return false, nil
 			}
-			if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr); err != nil {
+			if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr, nil /* testPort */); err != nil {
 				if err != tcpip.ErrPortInUse || !reuse {
 					return false, nil
 				}
@@ -2207,7 +2341,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 				tcpEP.notifyProtocolGoroutine(notifyAbort)
 				tcpEP.UnlockUser()
 				// Now try and Reserve again if it fails then we skip.
-				if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr); err != nil {
+				if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr, nil /* testPort */); err != nil {
 					return false, nil
 				}
 			}
@@ -2249,7 +2383,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	if !handshake {
 		e.segmentQueue.mu.Lock()
 		for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} {
-			for s := l.Front(); s != nil; s = s.segEntry.Next() {
+			for s := l.Front(); s != nil; s = s.Next() {
 				s.id = e.ID
 				s.route = r.Clone()
 				e.sndWaker.Assert()
@@ -2447,7 +2581,9 @@ func (e *endpoint) startAcceptedLoop() {
 
 // Accept returns a new endpoint if a peer has established a connection
 // to an endpoint previously set to listen mode.
-func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+//
+// addr if not-nil will contain the peer address of the returned endpoint.
+func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	e.LockUser()
 	defer e.UnlockUser()
 
@@ -2469,6 +2605,9 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	default:
 		return nil, nil, tcpip.ErrWouldBlock
 	}
+	if peerAddr != nil {
+		*peerAddr = n.getRemoteAddress()
+	}
 	return n, n.waiterQueue, nil
 }
 
@@ -2505,47 +2644,45 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 		}
 	}
 
-	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.portFlags, e.bindToDevice, tcpip.FullAddress{})
-	if err != nil {
-		return err
-	}
-
-	e.boundBindToDevice = e.bindToDevice
-	e.boundPortFlags = e.portFlags
-	e.isPortReserved = true
-	e.effectiveNetProtos = netProtos
-	e.ID.LocalPort = port
-
-	// Any failures beyond this point must remove the port registration.
-	defer func(portFlags ports.Flags, bindToDevice tcpip.NICID) {
-		if err != nil {
-			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, portFlags, bindToDevice, tcpip.FullAddress{})
-			e.isPortReserved = false
-			e.effectiveNetProtos = nil
-			e.ID.LocalPort = 0
-			e.ID.LocalAddress = ""
-			e.boundNICID = 0
-			e.boundBindToDevice = 0
-			e.boundPortFlags = ports.Flags{}
-		}
-	}(e.boundPortFlags, e.boundBindToDevice)
-
+	var nic tcpip.NICID
 	// If an address is specified, we must ensure that it's one of our
 	// local addresses.
 	if len(addr.Addr) != 0 {
-		nic := e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
+		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
 		if nic == 0 {
 			return tcpip.ErrBadLocalAddress
 		}
-
-		e.boundNICID = nic
 		e.ID.LocalAddress = addr.Addr
 	}
 
-	if err := e.stack.CheckRegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e.boundPortFlags, e.boundBindToDevice); err != nil {
+	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.portFlags, e.bindToDevice, tcpip.FullAddress{}, func(p uint16) bool {
+		id := e.ID
+		id.LocalPort = p
+		// CheckRegisterTransportEndpoint should only return an error if there is a
+		// listening endpoint bound with the same id and portFlags and bindToDevice
+		// options.
+		//
+		// NOTE: Only listening and connected endpoint register with
+		// demuxer. Further connected endpoints always have a remote
+		// address/port. Hence this will only return an error if there is a matching
+		// listening endpoint.
+		if err := e.stack.CheckRegisterTransportEndpoint(nic, netProtos, ProtocolNumber, id, e.portFlags, e.bindToDevice); err != nil {
+			return false
+		}
+		return true
+	})
+	if err != nil {
 		return err
 	}
 
+	e.boundBindToDevice = e.bindToDevice
+	e.boundPortFlags = e.portFlags
+	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
+	e.boundNICID = nic
+	e.isPortReserved = true
+	e.effectiveNetProtos = netProtos
+	e.ID.LocalPort = port
+
 	// Mark endpoint as bound.
 	e.setEndpointState(StateBound)
 
@@ -2573,11 +2710,15 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 		return tcpip.FullAddress{}, tcpip.ErrNotConnected
 	}
 
+	return e.getRemoteAddress(), nil
+}
+
+func (e *endpoint) getRemoteAddress() tcpip.FullAddress {
 	return tcpip.FullAddress{
 		Addr: e.ID.RemoteAddress,
 		Port: e.ID.RemotePort,
 		NIC:  e.boundNICID,
-	}, nil
+	}
 }
 
 func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
@@ -2648,13 +2789,8 @@ func (e *endpoint) updateSndBufferUsage(v int) {
 func (e *endpoint) readyToRead(s *segment) {
 	e.rcvListMu.Lock()
 	if s != nil {
+		e.rcvBufUsed += s.payloadSize()
 		s.incRef()
-		e.rcvBufUsed += s.data.Size()
-		// Increase counter if the receive window falls down below MSS
-		// or half receive buffer size, whichever smaller.
-		if crossed, above := e.windowCrossedACKThresholdLocked(-s.data.Size()); crossed && !above {
-			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
-		}
 		e.rcvList.PushBack(s)
 	} else {
 		e.rcvClosed = true
@@ -2669,15 +2805,17 @@ func (e *endpoint) readyToRead(s *segment) {
 func (e *endpoint) receiveBufferAvailableLocked() int {
 	// We may use more bytes than the buffer size when the receive buffer
 	// shrinks.
-	if e.rcvBufUsed >= e.rcvBufSize {
+	memUsed := e.receiveMemUsed()
+	if memUsed >= e.rcvBufSize {
 		return 0
 	}
 
-	return e.rcvBufSize - e.rcvBufUsed
+	return e.rcvBufSize - memUsed
 }
 
 // receiveBufferAvailable calculates how many bytes are still available in the
-// receive buffer.
+// receive buffer based on the actual memory used by all segments held in
+// receive buffer/pending and segment queue.
 func (e *endpoint) receiveBufferAvailable() int {
 	e.rcvListMu.Lock()
 	available := e.receiveBufferAvailableLocked()
@@ -2685,16 +2823,37 @@ func (e *endpoint) receiveBufferAvailable() int {
 	return available
 }
 
+// receiveBufferUsed returns the amount of in-use receive buffer.
+func (e *endpoint) receiveBufferUsed() int {
+	e.rcvListMu.Lock()
+	used := e.rcvBufUsed
+	e.rcvListMu.Unlock()
+	return used
+}
+
+// receiveBufferSize returns the current size of the receive buffer.
 func (e *endpoint) receiveBufferSize() int {
 	e.rcvListMu.Lock()
 	size := e.rcvBufSize
 	e.rcvListMu.Unlock()
-
 	return size
 }
 
+// receiveMemUsed returns the total memory in use by segments held by this
+// endpoint.
+func (e *endpoint) receiveMemUsed() int {
+	return int(atomic.LoadInt32(&e.rcvMemUsed))
+}
+
+// updateReceiveMemUsed adds the provided delta to e.rcvMemUsed.
+func (e *endpoint) updateReceiveMemUsed(delta int) {
+	atomic.AddInt32(&e.rcvMemUsed, int32(delta))
+}
+
+// maxReceiveBufferSize returns the stack wide maximum receive buffer size for
+// an endpoint.
 func (e *endpoint) maxReceiveBufferSize() int {
-	var rs ReceiveBufferSizeOption
+	var rs tcpip.TCPReceiveBufferSizeRangeOption
 	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
 		// As a fallback return the hardcoded max buffer size.
 		return MaxBufferSize
@@ -2774,7 +2933,7 @@ func timeStampOffset() uint32 {
 // if the SYN options indicate that the SACK option was negotiated and the TCP
 // stack is configured to enable TCP SACK option.
 func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) {
-	var v SACKEnabled
+	var v tcpip.TCPSACKEnabled
 	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
 		// Stack doesn't support SACK. So just return.
 		return
@@ -2843,7 +3002,6 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
 		RcvAcc:         e.rcv.rcvAcc,
 		RcvWndScale:    e.rcv.rcvWndScale,
 		PendingBufUsed: e.rcv.pendingBufUsed,
-		PendingBufSize: e.rcv.pendingBufSize,
 	}
 
 	// Copy sender state.
@@ -2898,6 +3056,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
 		EndSequence: rc.endSequence,
 		FACK:        rc.fack,
 		RTT:         rc.rtt,
+		Reord:       rc.reorderSeen,
 	}
 	return s
 }
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 723e47ddc..b25431467 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -44,7 +44,7 @@ func (e *endpoint) drainSegmentLocked() {
 // beforeSave is invoked by stateify.
 func (e *endpoint) beforeSave() {
 	// Stop incoming packets.
-	e.segmentQueue.setLimit(0)
+	e.segmentQueue.freeze()
 
 	e.mu.Lock()
 	defer e.mu.Unlock()
@@ -178,18 +178,18 @@ func (e *endpoint) afterLoad() {
 // Resume implements tcpip.ResumableEndpoint.Resume.
 func (e *endpoint) Resume(s *stack.Stack) {
 	e.stack = s
-	e.segmentQueue.setLimit(MaxUnprocessedSegments)
+	e.segmentQueue.thaw()
 	epState := e.origEndpointState
 	switch epState {
 	case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
-		var ss SendBufferSizeOption
+		var ss tcpip.TCPSendBufferSizeRangeOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
 			if e.sndBufSize < ss.Min || e.sndBufSize > ss.Max {
 				panic(fmt.Sprintf("endpoint.sndBufSize %d is outside the min and max allowed [%d, %d]", e.sndBufSize, ss.Min, ss.Max))
 			}
 		}
 
-		var rs ReceiveBufferSizeOption
+		var rs tcpip.TCPReceiveBufferSizeRangeOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
 			if e.rcvBufSize < rs.Min || e.rcvBufSize > rs.Max {
 				panic(fmt.Sprintf("endpoint.rcvBufSize %d is outside the min and max allowed [%d, %d]", e.rcvBufSize, rs.Min, rs.Max))
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index c5afa2680..5bce73605 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -12,12 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package tcp contains the implementation of the TCP transport protocol. To use
-// it in the networking stack, this package must be added to the project, and
-// activated on the stack by passing tcp.NewProtocol() as one of the
-// transport protocols when calling stack.New(). Then endpoints can be created
-// by passing tcp.ProtocolNumber as the transport protocol number when calling
-// Stack.NewEndpoint().
+// Package tcp contains the implementation of the TCP transport protocol.
 package tcp
 
 import (
@@ -29,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/header/parse"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
@@ -79,50 +75,6 @@ const (
 	ccCubic = "cubic"
 )
 
-// SACKEnabled is used by stack.(*Stack).TransportProtocolOption to
-// enable/disable SACK support in TCP. See: https://tools.ietf.org/html/rfc2018.
-type SACKEnabled bool
-
-// Recovery is used by stack.(*Stack).TransportProtocolOption to
-// set loss detection algorithm in TCP.
-type Recovery int32
-
-const (
-	// RACKLossDetection indicates RACK is used for loss detection and
-	// recovery.
-	RACKLossDetection Recovery = 1 << iota
-
-	// RACKStaticReoWnd indicates the reordering window should not be
-	// adjusted when DSACK is received.
-	RACKStaticReoWnd
-
-	// RACKNoDupTh indicates RACK should not consider the classic three
-	// duplicate acknowledgements rule to mark the segments as lost. This
-	// is used when reordering is not detected.
-	RACKNoDupTh
-)
-
-// DelayEnabled is used by stack.(Stack*).TransportProtocolOption to
-// enable/disable Nagle's algorithm in TCP.
-type DelayEnabled bool
-
-// SendBufferSizeOption is used by stack.(Stack*).TransportProtocolOption
-// to get/set the default, min and max TCP send buffer sizes.
-type SendBufferSizeOption struct {
-	Min     int
-	Default int
-	Max     int
-}
-
-// ReceiveBufferSizeOption is used by
-// stack.(Stack*).TransportProtocolOption to get/set the default, min and max
-// TCP receive buffer sizes.
-type ReceiveBufferSizeOption struct {
-	Min     int
-	Default int
-	Max     int
-}
-
 // syncRcvdCounter tracks the number of endpoints in the SYN-RCVD state. The
 // value is protected by a mutex so that we can increment only when it's
 // guaranteed not to go above a threshold.
@@ -181,12 +133,14 @@ func (s *synRcvdCounter) Threshold() uint64 {
 }
 
 type protocol struct {
+	stack *stack.Stack
+
 	mu                         sync.RWMutex
 	sackEnabled                bool
-	recovery                   Recovery
+	recovery                   tcpip.TCPRecovery
 	delayEnabled               bool
-	sendBufferSize             SendBufferSizeOption
-	recvBufferSize             ReceiveBufferSizeOption
+	sendBufferSize             tcpip.TCPSendBufferSizeRangeOption
+	recvBufferSize             tcpip.TCPReceiveBufferSizeRangeOption
 	congestionControl          string
 	availableCongestionControl []string
 	moderateReceiveBuffer      bool
@@ -207,14 +161,14 @@ func (*protocol) Number() tcpip.TransportProtocolNumber {
 }
 
 // NewEndpoint creates a new tcp endpoint.
-func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
-	return newEndpoint(stack, netProto, waiterQueue), nil
+func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return newEndpoint(p.stack, netProto, waiterQueue), nil
 }
 
 // NewRawEndpoint creates a new raw TCP endpoint. Raw TCP sockets are currently
 // unsupported. It implements stack.TransportProtocol.NewRawEndpoint.
-func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
-	return raw.NewEndpoint(stack, netProto, header.TCPProtocolNumber, waiterQueue)
+func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return raw.NewEndpoint(p.stack, netProto, header.TCPProtocolNumber, waiterQueue)
 }
 
 // MinimumPacketSize returns the minimum valid tcp packet size.
@@ -244,21 +198,20 @@ func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id st
 // a reset is sent in response to any incoming segment except another reset. In
 // particular, SYNs addressed to a non-existent connection are rejected by this
 // means."
-func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
+
+func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition {
 	s := newSegment(r, id, pkt)
 	defer s.decRef()
 
 	if !s.parse() || !s.csumValid {
-		return false
+		return stack.UnknownDestinationPacketMalformed
 	}
 
-	// There's nothing to do if this is already a reset packet.
-	if s.flagIsSet(header.TCPFlagRst) {
-		return true
+	if !s.flagIsSet(header.TCPFlagRst) {
+		replyWithReset(s, stack.DefaultTOS, s.route.DefaultTTL())
 	}
 
-	replyWithReset(s, stack.DefaultTOS, s.route.DefaultTTL())
-	return true
+	return stack.UnknownDestinationPacketHandled
 }
 
 // replyWithReset replies to the given segment with a reset segment.
@@ -296,49 +249,49 @@ func replyWithReset(s *segment, tos, ttl uint8) {
 }
 
 // SetOption implements stack.TransportProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case SACKEnabled:
+	case *tcpip.TCPSACKEnabled:
 		p.mu.Lock()
-		p.sackEnabled = bool(v)
+		p.sackEnabled = bool(*v)
 		p.mu.Unlock()
 		return nil
 
-	case Recovery:
+	case *tcpip.TCPRecovery:
 		p.mu.Lock()
-		p.recovery = Recovery(v)
+		p.recovery = *v
 		p.mu.Unlock()
 		return nil
 
-	case DelayEnabled:
+	case *tcpip.TCPDelayEnabled:
 		p.mu.Lock()
-		p.delayEnabled = bool(v)
+		p.delayEnabled = bool(*v)
 		p.mu.Unlock()
 		return nil
 
-	case SendBufferSizeOption:
+	case *tcpip.TCPSendBufferSizeRangeOption:
 		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
 			return tcpip.ErrInvalidOptionValue
 		}
 		p.mu.Lock()
-		p.sendBufferSize = v
+		p.sendBufferSize = *v
 		p.mu.Unlock()
 		return nil
 
-	case ReceiveBufferSizeOption:
+	case *tcpip.TCPReceiveBufferSizeRangeOption:
 		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
 			return tcpip.ErrInvalidOptionValue
 		}
 		p.mu.Lock()
-		p.recvBufferSize = v
+		p.recvBufferSize = *v
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.CongestionControlOption:
+	case *tcpip.CongestionControlOption:
 		for _, c := range p.availableCongestionControl {
-			if string(v) == c {
+			if string(*v) == c {
 				p.mu.Lock()
-				p.congestionControl = string(v)
+				p.congestionControl = string(*v)
 				p.mu.Unlock()
 				return nil
 			}
@@ -347,75 +300,79 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 		// is specified.
 		return tcpip.ErrNoSuchFile
 
-	case tcpip.ModerateReceiveBufferOption:
+	case *tcpip.TCPModerateReceiveBufferOption:
 		p.mu.Lock()
-		p.moderateReceiveBuffer = bool(v)
+		p.moderateReceiveBuffer = bool(*v)
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPLingerTimeoutOption:
-		if v < 0 {
-			v = 0
-		}
+	case *tcpip.TCPLingerTimeoutOption:
 		p.mu.Lock()
-		p.lingerTimeout = time.Duration(v)
+		if *v < 0 {
+			p.lingerTimeout = 0
+		} else {
+			p.lingerTimeout = time.Duration(*v)
+		}
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPTimeWaitTimeoutOption:
-		if v < 0 {
-			v = 0
-		}
+	case *tcpip.TCPTimeWaitTimeoutOption:
 		p.mu.Lock()
-		p.timeWaitTimeout = time.Duration(v)
+		if *v < 0 {
+			p.timeWaitTimeout = 0
+		} else {
+			p.timeWaitTimeout = time.Duration(*v)
+		}
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPTimeWaitReuseOption:
-		if v < tcpip.TCPTimeWaitReuseDisabled || v > tcpip.TCPTimeWaitReuseLoopbackOnly {
+	case *tcpip.TCPTimeWaitReuseOption:
+		if *v < tcpip.TCPTimeWaitReuseDisabled || *v > tcpip.TCPTimeWaitReuseLoopbackOnly {
 			return tcpip.ErrInvalidOptionValue
 		}
 		p.mu.Lock()
-		p.timeWaitReuse = v
+		p.timeWaitReuse = *v
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPMinRTOOption:
-		if v < 0 {
-			v = tcpip.TCPMinRTOOption(MinRTO)
-		}
+	case *tcpip.TCPMinRTOOption:
 		p.mu.Lock()
-		p.minRTO = time.Duration(v)
+		if *v < 0 {
+			p.minRTO = MinRTO
+		} else {
+			p.minRTO = time.Duration(*v)
+		}
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPMaxRTOOption:
-		if v < 0 {
-			v = tcpip.TCPMaxRTOOption(MaxRTO)
-		}
+	case *tcpip.TCPMaxRTOOption:
 		p.mu.Lock()
-		p.maxRTO = time.Duration(v)
+		if *v < 0 {
+			p.maxRTO = MaxRTO
+		} else {
+			p.maxRTO = time.Duration(*v)
+		}
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPMaxRetriesOption:
+	case *tcpip.TCPMaxRetriesOption:
 		p.mu.Lock()
-		p.maxRetries = uint32(v)
+		p.maxRetries = uint32(*v)
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPSynRcvdCountThresholdOption:
+	case *tcpip.TCPSynRcvdCountThresholdOption:
 		p.mu.Lock()
-		p.synRcvdCount.SetThreshold(uint64(v))
+		p.synRcvdCount.SetThreshold(uint64(*v))
 		p.mu.Unlock()
 		return nil
 
-	case tcpip.TCPSynRetriesOption:
-		if v < 1 || v > 255 {
+	case *tcpip.TCPSynRetriesOption:
+		if *v < 1 || *v > 255 {
 			return tcpip.ErrInvalidOptionValue
 		}
 		p.mu.Lock()
-		p.synRetries = uint8(v)
+		p.synRetries = uint8(*v)
 		p.mu.Unlock()
 		return nil
 
@@ -425,33 +382,33 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 }
 
 // Option implements stack.TransportProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) *tcpip.Error {
 	switch v := option.(type) {
-	case *SACKEnabled:
+	case *tcpip.TCPSACKEnabled:
 		p.mu.RLock()
-		*v = SACKEnabled(p.sackEnabled)
+		*v = tcpip.TCPSACKEnabled(p.sackEnabled)
 		p.mu.RUnlock()
 		return nil
 
-	case *Recovery:
+	case *tcpip.TCPRecovery:
 		p.mu.RLock()
-		*v = Recovery(p.recovery)
+		*v = tcpip.TCPRecovery(p.recovery)
 		p.mu.RUnlock()
 		return nil
 
-	case *DelayEnabled:
+	case *tcpip.TCPDelayEnabled:
 		p.mu.RLock()
-		*v = DelayEnabled(p.delayEnabled)
+		*v = tcpip.TCPDelayEnabled(p.delayEnabled)
 		p.mu.RUnlock()
 		return nil
 
-	case *SendBufferSizeOption:
+	case *tcpip.TCPSendBufferSizeRangeOption:
 		p.mu.RLock()
 		*v = p.sendBufferSize
 		p.mu.RUnlock()
 		return nil
 
-	case *ReceiveBufferSizeOption:
+	case *tcpip.TCPReceiveBufferSizeRangeOption:
 		p.mu.RLock()
 		*v = p.recvBufferSize
 		p.mu.RUnlock()
@@ -463,15 +420,15 @@ func (p *protocol) Option(option interface{}) *tcpip.Error {
 		p.mu.RUnlock()
 		return nil
 
-	case *tcpip.AvailableCongestionControlOption:
+	case *tcpip.TCPAvailableCongestionControlOption:
 		p.mu.RLock()
-		*v = tcpip.AvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
+		*v = tcpip.TCPAvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
 		p.mu.RUnlock()
 		return nil
 
-	case *tcpip.ModerateReceiveBufferOption:
+	case *tcpip.TCPModerateReceiveBufferOption:
 		p.mu.RLock()
-		*v = tcpip.ModerateReceiveBufferOption(p.moderateReceiveBuffer)
+		*v = tcpip.TCPModerateReceiveBufferOption(p.moderateReceiveBuffer)
 		p.mu.RUnlock()
 		return nil
 
@@ -546,33 +503,19 @@ func (p *protocol) SynRcvdCounter() *synRcvdCounter {
 
 // Parse implements stack.TransportProtocol.Parse.
 func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
-	// TCP header is variable length, peek at it first.
-	hdrLen := header.TCPMinimumSize
-	hdr, ok := pkt.Data.PullUp(hdrLen)
-	if !ok {
-		return false
-	}
-
-	// If the header has options, pull those up as well.
-	if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() {
-		// TODO(gvisor.dev/issue/2404): Figure out whether to reject this kind of
-		// packets.
-		hdrLen = offset
-	}
-
-	_, ok = pkt.TransportHeader().Consume(hdrLen)
-	return ok
+	return parse.TCP(pkt)
 }
 
 // NewProtocol returns a TCP transport protocol.
-func NewProtocol() stack.TransportProtocol {
+func NewProtocol(s *stack.Stack) stack.TransportProtocol {
 	p := protocol{
-		sendBufferSize: SendBufferSizeOption{
+		stack: s,
+		sendBufferSize: tcpip.TCPSendBufferSizeRangeOption{
 			Min:     MinBufferSize,
 			Default: DefaultSendBufferSize,
 			Max:     MaxBufferSize,
 		},
-		recvBufferSize: ReceiveBufferSizeOption{
+		recvBufferSize: tcpip.TCPReceiveBufferSizeRangeOption{
 			Min:     MinBufferSize,
 			Default: DefaultReceiveBufferSize,
 			Max:     MaxBufferSize,
@@ -587,7 +530,7 @@ func NewProtocol() stack.TransportProtocol {
 		minRTO:                     MinRTO,
 		maxRTO:                     MaxRTO,
 		maxRetries:                 MaxRetries,
-		recovery:                   RACKLossDetection,
+		recovery:                   tcpip.TCPRACKLossDetection,
 	}
 	p.dispatcher.init(runtime.GOMAXPROCS(0))
 	return &p
diff --git a/pkg/tcpip/transport/tcp/rack.go b/pkg/tcpip/transport/tcp/rack.go
index d969ca23a..d312b1b8b 100644
--- a/pkg/tcpip/transport/tcp/rack.go
+++ b/pkg/tcpip/transport/tcp/rack.go
@@ -29,26 +29,36 @@ import (
 //
 // +stateify savable
 type rackControl struct {
-	// xmitTime is the latest transmission timestamp of rackControl.seg.
-	xmitTime time.Time `state:".(unixTime)"`
-
 	// endSequence is the ending TCP sequence number of rackControl.seg.
 	endSequence seqnum.Value
 
+	// dsack indicates if the connection has seen a DSACK.
+	dsack bool
+
 	// fack is the highest selectively or cumulatively acknowledged
 	// sequence.
 	fack seqnum.Value
 
+	// minRTT is the estimated minimum RTT of the connection.
+	minRTT time.Duration
+
 	// rtt is the RTT of the most recently delivered packet on the
 	// connection (either cumulatively acknowledged or selectively
 	// acknowledged) that was not marked invalid as a possible spurious
 	// retransmission.
 	rtt time.Duration
+
+	// reorderSeen indicates if reordering has been detected on this
+	// connection.
+	reorderSeen bool
+
+	// xmitTime is the latest transmission timestamp of rackControl.seg.
+	xmitTime time.Time `state:".(unixTime)"`
 }
 
-// Update will update the RACK related fields when an ACK has been received.
+// update will update the RACK related fields when an ACK has been received.
 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
-func (rc *rackControl) Update(seg *segment, ackSeg *segment, srtt time.Duration, offset uint32) {
+func (rc *rackControl) update(seg *segment, ackSeg *segment, offset uint32) {
 	rtt := time.Now().Sub(seg.xmitTime)
 
 	// If the ACK is for a retransmitted packet, do not update if it is a
@@ -65,12 +75,21 @@ func (rc *rackControl) Update(seg *segment, ackSeg *segment, srtt time.Duration,
 				return
 			}
 		}
-		if rtt < srtt {
+		if rtt < rc.minRTT {
 			return
 		}
 	}
 
 	rc.rtt = rtt
+
+	// The sender can either track a simple global minimum of all RTT
+	// measurements from the connection, or a windowed min-filtered value
+	// of recent RTT measurements. This implementation keeps track of the
+	// simple global minimum of all RTTs for the connection.
+	if rtt < rc.minRTT || rc.minRTT == 0 {
+		rc.minRTT = rtt
+	}
+
 	// Update rc.xmitTime and rc.endSequence to the transmit time and
 	// ending sequence number of the packet which has been acknowledged
 	// most recently.
@@ -80,3 +99,26 @@ func (rc *rackControl) Update(seg *segment, ackSeg *segment, srtt time.Duration,
 		rc.endSequence = endSeq
 	}
 }
+
+// detectReorder detects if packet reordering has been observed.
+// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
+// * Step 3: Detect data segment reordering.
+//   To detect reordering, the sender looks for original data segments being
+//   delivered out of order. To detect such cases, the sender tracks the
+//   highest sequence selectively or cumulatively acknowledged in the RACK.fack
+//   variable. The name "fack" stands for the most "Forward ACK" (this term is
+//   adopted from [FACK]). If a never retransmitted segment that's below
+//   RACK.fack is (selectively or cumulatively) acknowledged, it has been
+//   delivered out of order. The sender sets RACK.reord to TRUE if such segment
+//   is identified.
+func (rc *rackControl) detectReorder(seg *segment) {
+	endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size()))
+	if rc.fack.LessThan(endSeq) {
+		rc.fack = endSeq
+		return
+	}
+
+	if endSeq.LessThan(rc.fack) && seg.xmitCount == 1 {
+		rc.reorderSeen = true
+	}
+}
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 5e0bfe585..8e0b7c843 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -43,26 +43,32 @@ type receiver struct {
 	// rcvWnd is the non-scaled receive window last advertised to the peer.
 	rcvWnd seqnum.Size
 
+	// rcvWUP is the rcvNxt value at the last window update sent.
+	rcvWUP seqnum.Value
+
 	rcvWndScale uint8
 
 	closed bool
 
+	// pendingRcvdSegments is bounded by the receive buffer size of the
+	// endpoint.
 	pendingRcvdSegments segmentHeap
-	pendingBufUsed      seqnum.Size
-	pendingBufSize      seqnum.Size
+	// pendingBufUsed tracks the total number of bytes (including segment
+	// overhead) currently queued in pendingRcvdSegments.
+	pendingBufUsed int
 
 	// Time when the last ack was received.
 	lastRcvdAckTime time.Time `state:".(unixTime)"`
 }
 
-func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8, pendingBufSize seqnum.Size) *receiver {
+func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8) *receiver {
 	return &receiver{
 		ep:              ep,
 		rcvNxt:          irs + 1,
 		rcvAcc:          irs.Add(rcvWnd + 1),
 		rcvWnd:          rcvWnd,
+		rcvWUP:          irs + 1,
 		rcvWndScale:     rcvWndScale,
-		pendingBufSize:  pendingBufSize,
 		lastRcvdAckTime: time.Now(),
 	}
 }
@@ -82,19 +88,54 @@ func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
 	return header.Acceptable(segSeq, segLen, r.rcvNxt, r.rcvNxt.Add(advertisedWindowSize))
 }
 
+// currentWindow returns the available space in the window that was advertised
+// last to our peer.
+func (r *receiver) currentWindow() (curWnd seqnum.Size) {
+	endOfWnd := r.rcvWUP.Add(r.rcvWnd)
+	if endOfWnd.LessThan(r.rcvNxt) {
+		// return 0 if r.rcvNxt is past the end of the previously advertised window.
+		// This can happen because we accept a large segment completely even if
+		// accepting it causes it to partially exceed the advertised window.
+		return 0
+	}
+	return r.rcvNxt.Size(endOfWnd)
+}
+
 // getSendParams returns the parameters needed by the sender when building
 // segments to send.
 func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
-	// Calculate the window size based on the available buffer space.
-	receiveBufferAvailable := r.ep.receiveBufferAvailable()
-	acc := r.rcvNxt.Add(seqnum.Size(receiveBufferAvailable))
-	if r.rcvAcc.LessThan(acc) {
-		r.rcvAcc = acc
+	newWnd := r.ep.selectWindow()
+	curWnd := r.currentWindow()
+	// Update rcvAcc only if new window is > previously advertised window. We
+	// should never shrink the acceptable sequence space once it has been
+	// advertised the peer. If we shrink the acceptable sequence space then we
+	// would end up dropping bytes that might already be in flight.
+	// ====================================================  sequence space.
+	// ^             ^               ^                   ^
+	// rcvWUP       rcvNxt         rcvAcc          new rcvAcc
+	//               <=====curWnd ===>
+	//               <========= newWnd > curWnd ========= >
+	if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) {
+		// If the new window moves the right edge, then update rcvAcc.
+		r.rcvAcc = r.rcvNxt.Add(seqnum.Size(newWnd))
+	} else {
+		if newWnd == 0 {
+			// newWnd is zero but we can't advertise a zero as it would cause window
+			// to shrink so just increment a metric to record this event.
+			r.ep.stats.ReceiveErrors.WantZeroRcvWindow.Increment()
+		}
+		newWnd = curWnd
 	}
 	// Stash away the non-scaled receive window as we use it for measuring
 	// receiver's estimated RTT.
-	r.rcvWnd = r.rcvNxt.Size(r.rcvAcc)
-	return r.rcvNxt, r.rcvWnd >> r.rcvWndScale
+	r.rcvWnd = newWnd
+	r.rcvWUP = r.rcvNxt
+	scaledWnd := r.rcvWnd >> r.rcvWndScale
+	if scaledWnd == 0 {
+		// Increment a metric if we are advertising an actual zero window.
+		r.ep.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
+	}
+	return r.rcvNxt, scaledWnd
 }
 
 // nonZeroWindow is called when the receive window grows from zero to nonzero;
@@ -195,7 +236,9 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		}
 
 		for i := first; i < len(r.pendingRcvdSegments); i++ {
+			r.pendingBufUsed -= r.pendingRcvdSegments[i].segMemSize()
 			r.pendingRcvdSegments[i].decRef()
+
 			// Note that slice truncation does not allow garbage collection of
 			// truncated items, thus truncated items must be set to nil to avoid
 			// memory leaks.
@@ -268,14 +311,7 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 	// If we are in one of the shutdown states then we need to do
 	// additional checks before we try and process the segment.
 	switch state {
-	case StateCloseWait:
-		// If the ACK acks something not yet sent then we send an ACK.
-		if r.ep.snd.sndNxt.LessThan(s.ackNumber) {
-			r.ep.snd.sendAck()
-			return true, nil
-		}
-		fallthrough
-	case StateClosing, StateLastAck:
+	case StateCloseWait, StateClosing, StateLastAck:
 		if !s.sequenceNumber.LessThanEq(r.rcvNxt) {
 			// Just drop the segment as we have
 			// already received a FIN and this
@@ -284,9 +320,31 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 			return true, nil
 		}
 		fallthrough
-	case StateFinWait1:
-		fallthrough
-	case StateFinWait2:
+	case StateFinWait1, StateFinWait2:
+		// If the ACK acks something not yet sent then we send an ACK.
+		//
+		// RFC793, page 37: If the connection is in a synchronized state,
+		// (ESTABLISHED, FIN-WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK,
+		// TIME-WAIT), any unacceptable segment (out of window sequence number
+		// or unacceptable acknowledgment number) must elicit only an empty
+		// acknowledgment segment containing the current send-sequence number
+		// and an acknowledgment indicating the next sequence number expected
+		// to be received, and the connection remains in the same state.
+		//
+		// Just as on Linux, we do not apply this behavior when state is
+		// ESTABLISHED.
+		// Linux receive processing for all states except ESTABLISHED and
+		// TIME_WAIT is here where if the ACK check fails, we attempt to
+		// reply back with an ACK with correct seq/ack numbers.
+		// https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L6186
+		// The ESTABLISHED state processing is here where if the ACK check
+		// fails, we ignore the packet:
+		// https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L5591
+		if r.ep.snd.sndNxt.LessThan(s.ackNumber) {
+			r.ep.snd.sendAck()
+			return true, nil
+		}
+
 		// If we are closed for reads (either due to an
 		// incoming FIN or the user calling shutdown(..,
 		// SHUT_RD) then any data past the rcvNxt should
@@ -369,10 +427,16 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
 	// Defer segment processing if it can't be consumed now.
 	if !r.consumeSegment(s, segSeq, segLen) {
 		if segLen > 0 || s.flagIsSet(header.TCPFlagFin) {
-			// We only store the segment if it's within our buffer
-			// size limit.
-			if r.pendingBufUsed < r.pendingBufSize {
-				r.pendingBufUsed += seqnum.Size(s.segMemSize())
+			// We only store the segment if it's within our buffer size limit.
+			//
+			// Only use 75% of the receive buffer queue for out-of-order
+			// segments. This ensures that we always leave some space for the inorder
+			// segments to arrive allowing pending segments to be processed and
+			// delivered to the user.
+			if r.ep.receiveBufferAvailable() > 0 && r.pendingBufUsed < r.ep.receiveBufferSize()>>2 {
+				r.ep.rcvListMu.Lock()
+				r.pendingBufUsed += s.segMemSize()
+				r.ep.rcvListMu.Unlock()
 				s.incRef()
 				heap.Push(&r.pendingRcvdSegments, s)
 				UpdateSACKBlocks(&r.ep.sack, segSeq, segSeq.Add(segLen), r.rcvNxt)
@@ -406,7 +470,9 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
 		}
 
 		heap.Pop(&r.pendingRcvdSegments)
-		r.pendingBufUsed -= seqnum.Size(s.segMemSize())
+		r.ep.rcvListMu.Lock()
+		r.pendingBufUsed -= s.segMemSize()
+		r.ep.rcvListMu.Unlock()
 		s.decRef()
 	}
 	return false, nil
@@ -421,6 +487,13 @@ func (r *receiver) handleTimeWaitSegment(s *segment) (resetTimeWait bool, newSyn
 	// Just silently drop any RST packets in TIME_WAIT. We do not support
 	// TIME_WAIT assasination as a result we confirm w/ fix 1 as described
 	// in https://tools.ietf.org/html/rfc1337#section-3.
+	//
+	// This behavior overrides RFC793 page 70 where we transition to CLOSED
+	// on receiving RST, which is also default Linux behavior.
+	// On Linux the RST can be ignored by setting sysctl net.ipv4.tcp_rfc1337.
+	//
+	// As we do not yet support PAWS, we are being conservative in ignoring
+	// RSTs by default.
 	if s.flagIsSet(header.TCPFlagRst) {
 		return false, false
 	}
diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard.go b/pkg/tcpip/transport/tcp/sack_scoreboard.go
index 7ef2df377..833a7b470 100644
--- a/pkg/tcpip/transport/tcp/sack_scoreboard.go
+++ b/pkg/tcpip/transport/tcp/sack_scoreboard.go
@@ -164,7 +164,7 @@ func (s *SACKScoreboard) IsSACKED(r header.SACKBlock) bool {
 	return found
 }
 
-// Dump prints the state of the scoreboard structure.
+// String returns human-readable state of the scoreboard structure.
 func (s *SACKScoreboard) String() string {
 	var str strings.Builder
 	str.WriteString("SACKScoreboard: {")
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index a20755f78..1f9c5cf50 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -15,6 +15,7 @@
 package tcp
 
 import (
+	"fmt"
 	"sync/atomic"
 	"time"
 
@@ -24,19 +25,29 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
+// queueFlags are used to indicate which queue of an endpoint a particular segment
+// belongs to. This is used to track memory accounting correctly.
+type queueFlags uint8
+
+const (
+	recvQ queueFlags = 1 << iota
+	sendQ
+)
+
 // segment represents a TCP segment. It holds the payload and parsed TCP segment
 // information, and can be added to intrusive lists.
 // segment is mostly immutable, the only field allowed to change is viewToDeliver.
 //
 // +stateify savable
 type segment struct {
-	segEntry     segmentEntry
-	rackSegEntry rackSegmentEntry
-	refCnt       int32
-	id           stack.TransportEndpointID `state:"manual"`
-	route        stack.Route               `state:"manual"`
-	data         buffer.VectorisedView     `state:".(buffer.VectorisedView)"`
-	hdr          header.TCP
+	segmentEntry
+	refCnt int32
+	ep     *endpoint
+	qFlags queueFlags
+	id     stack.TransportEndpointID `state:"manual"`
+	route  stack.Route               `state:"manual"`
+	data   buffer.VectorisedView     `state:".(buffer.VectorisedView)"`
+	hdr    header.TCP
 	// views is used as buffer for data when its length is large
 	// enough to store a VectorisedView.
 	views [8]buffer.View `state:"nosave"`
@@ -60,17 +71,10 @@ type segment struct {
 	// xmitTime is the last transmit time of this segment.
 	xmitTime  time.Time `state:".(unixTime)"`
 	xmitCount uint32
-}
-
-// segmentMapper is the ElementMapper for the writeList.
-type segmentMapper struct{}
-
-func (segmentMapper) linkerFor(seg *segment) *segmentEntry { return &seg.segEntry }
-
-// rackSegmentMapper is the ElementMapper for the rcList.
-type rackSegmentMapper struct{}
 
-func (rackSegmentMapper) linkerFor(seg *segment) *rackSegmentEntry { return &seg.rackSegEntry }
+	// acked indicates if the segment has already been SACKed.
+	acked bool
+}
 
 func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) *segment {
 	s := &segment{
@@ -111,6 +115,8 @@ func (s *segment) clone() *segment {
 		rcvdTime:       s.rcvdTime,
 		xmitTime:       s.xmitTime,
 		xmitCount:      s.xmitCount,
+		ep:             s.ep,
+		qFlags:         s.qFlags,
 	}
 	t.data = s.data.Clone(t.views[:])
 	return t
@@ -126,8 +132,34 @@ func (s *segment) flagsAreSet(flags uint8) bool {
 	return s.flags&flags == flags
 }
 
+// setOwner sets the owning endpoint for this segment. Its required
+// to be called to ensure memory accounting for receive/send buffer
+// queues is done properly.
+func (s *segment) setOwner(ep *endpoint, qFlags queueFlags) {
+	switch qFlags {
+	case recvQ:
+		ep.updateReceiveMemUsed(s.segMemSize())
+	case sendQ:
+		// no memory account for sendQ yet.
+	default:
+		panic(fmt.Sprintf("unexpected queue flag %b", qFlags))
+	}
+	s.ep = ep
+	s.qFlags = qFlags
+}
+
 func (s *segment) decRef() {
 	if atomic.AddInt32(&s.refCnt, -1) == 0 {
+		if s.ep != nil {
+			switch s.qFlags {
+			case recvQ:
+				s.ep.updateReceiveMemUsed(-s.segMemSize())
+			case sendQ:
+				// no memory accounting for sendQ yet.
+			default:
+				panic(fmt.Sprintf("unexpected queue flag %b set for segment", s.qFlags))
+			}
+		}
 		s.route.Release()
 	}
 }
@@ -149,6 +181,11 @@ func (s *segment) logicalLen() seqnum.Size {
 	return l
 }
 
+// payloadSize is the size of s.data.
+func (s *segment) payloadSize() int {
+	return s.data.Size()
+}
+
 // segMemSize is the amount of memory used to hold the segment data and
 // the associated metadata.
 func (s *segment) segMemSize() int {
diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go
index 48a257137..54545a1b1 100644
--- a/pkg/tcpip/transport/tcp/segment_queue.go
+++ b/pkg/tcpip/transport/tcp/segment_queue.go
@@ -22,16 +22,16 @@ import (
 //
 // +stateify savable
 type segmentQueue struct {
-	mu    sync.Mutex  `state:"nosave"`
-	list  segmentList `state:"wait"`
-	limit int
-	used  int
+	mu     sync.Mutex  `state:"nosave"`
+	list   segmentList `state:"wait"`
+	ep     *endpoint
+	frozen bool
 }
 
 // emptyLocked determines if the queue is empty.
 // Preconditions: q.mu must be held.
 func (q *segmentQueue) emptyLocked() bool {
-	return q.used == 0
+	return q.list.Empty()
 }
 
 // empty determines if the queue is empty.
@@ -43,14 +43,6 @@ func (q *segmentQueue) empty() bool {
 	return r
 }
 
-// setLimit updates the limit. No segments are immediately dropped in case the
-// queue becomes full due to the new limit.
-func (q *segmentQueue) setLimit(limit int) {
-	q.mu.Lock()
-	q.limit = limit
-	q.mu.Unlock()
-}
-
 // enqueue adds the given segment to the queue.
 //
 // Returns true when the segment is successfully added to the queue, in which
@@ -58,15 +50,23 @@ func (q *segmentQueue) setLimit(limit int) {
 // false if the queue is full, in which case ownership is retained by the
 // caller.
 func (q *segmentQueue) enqueue(s *segment) bool {
+	// q.ep.receiveBufferParams() must be called without holding q.mu to
+	// avoid lock order inversion.
+	bufSz := q.ep.receiveBufferSize()
+	used := q.ep.receiveMemUsed()
 	q.mu.Lock()
-	r := q.used < q.limit
-	if r {
+	// Allow zero sized segments (ACK/FIN/RSTs etc even if the segment queue
+	// is currently full).
+	allow := (used <= bufSz || s.payloadSize() == 0) && !q.frozen
+
+	if allow {
 		q.list.PushBack(s)
-		q.used++
+		// Set the owner now that the endpoint owns the segment.
+		s.setOwner(q.ep, recvQ)
 	}
 	q.mu.Unlock()
 
-	return r
+	return allow
 }
 
 // dequeue removes and returns the next segment from queue, if one exists.
@@ -77,9 +77,25 @@ func (q *segmentQueue) dequeue() *segment {
 	s := q.list.Front()
 	if s != nil {
 		q.list.Remove(s)
-		q.used--
 	}
 	q.mu.Unlock()
 
 	return s
 }
+
+// freeze prevents any more segments from being added to the queue. i.e all
+// future segmentQueue.enqueue will return false and not add the segment to the
+// queue till the queue is unfroze with a corresponding segmentQueue.thaw call.
+func (q *segmentQueue) freeze() {
+	q.mu.Lock()
+	q.frozen = true
+	q.mu.Unlock()
+}
+
+// thaw unfreezes a previously frozen queue using segmentQueue.freeze() and
+// allows new segments to be queued again.
+func (q *segmentQueue) thaw() {
+	q.mu.Lock()
+	q.frozen = false
+	q.mu.Unlock()
+}
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 31151f23d..6fa8d63cd 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -17,6 +17,7 @@ package tcp
 import (
 	"fmt"
 	"math"
+	"sort"
 	"sync/atomic"
 	"time"
 
@@ -154,7 +155,6 @@ type sender struct {
 	closed      bool
 	writeNext   *segment
 	writeList   segmentList
-	rcList      rackSegmentList
 	resendTimer timer       `state:"nosave"`
 	resendWaker sleep.Waker `state:"nosave"`
 
@@ -264,6 +264,9 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
 			highRxt:   iss,
 			rescueRxt: iss,
 		},
+		rc: rackControl{
+			fack: iss,
+		},
 		gso: ep.gso != nil,
 	}
 
@@ -368,7 +371,7 @@ func (s *sender) updateMaxPayloadSize(mtu, count int) {
 
 	// Rewind writeNext to the first segment exceeding the MTU. Do nothing
 	// if it is already before such a packet.
-	for seg := s.writeList.Front(); seg != nil; seg = seg.segEntry.Next() {
+	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
 		if seg == s.writeNext {
 			// We got to writeNext before we could find a segment
 			// exceeding the MTU.
@@ -623,7 +626,6 @@ func (s *sender) splitSeg(seg *segment, size int) {
 	nSeg.data.TrimFront(size)
 	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
 	s.writeList.InsertAfter(seg, nSeg)
-	s.rcList.InsertAfter(seg, nSeg)
 
 	// The segment being split does not carry PUSH flag because it is
 	// followed by the newly split segment.
@@ -655,7 +657,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 	var s3 *segment
 	var s4 *segment
 	// Step 1.
-	for seg := nextSegHint; seg != nil; seg = seg.segEntry.Next() {
+	for seg := nextSegHint; seg != nil; seg = seg.Next() {
 		// Stop iteration if we hit a segment that has never been
 		// transmitted (i.e. either it has no assigned sequence number
 		// or if it does have one, it's >= the next sequence number
@@ -685,7 +687,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 				// NextSeg():
 				//     (1.c) IsLost(S2) returns true.
 				if s.ep.scoreboard.IsLost(segSeq) {
-					return seg, seg.segEntry.Next(), false
+					return seg, seg.Next(), false
 				}
 
 				// NextSeg():
@@ -699,7 +701,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 				// SHOULD be returned.
 				if s3 == nil {
 					s3 = seg
-					hint = seg.segEntry.Next()
+					hint = seg.Next()
 				}
 			}
 			// NextSeg():
@@ -733,7 +735,7 @@ func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRt
 	// range of one segment of up to SMSS octets of
 	// previously unsent data starting with sequence number
 	// HighData+1 MUST be returned."
-	for seg := s.writeNext; seg != nil; seg = seg.segEntry.Next() {
+	for seg := s.writeNext; seg != nil; seg = seg.Next() {
 		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) {
 			continue
 		}
@@ -775,16 +777,15 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 			// triggering bugs in poorly written DNS
 			// implementations.
 			var nextTooBig bool
-			for seg.segEntry.Next() != nil && seg.segEntry.Next().data.Size() != 0 {
-				if seg.data.Size()+seg.segEntry.Next().data.Size() > available {
+			for seg.Next() != nil && seg.Next().data.Size() != 0 {
+				if seg.data.Size()+seg.Next().data.Size() > available {
 					nextTooBig = true
 					break
 				}
-				seg.data.Append(seg.segEntry.Next().data)
+				seg.data.Append(seg.Next().data)
 
 				// Consume the segment that we just merged in.
-				s.writeList.Remove(seg.segEntry.Next())
-				s.rcList.Remove(seg.rackSegEntry.Next())
+				s.writeList.Remove(seg.Next())
 			}
 			if !nextTooBig && seg.data.Size() < available {
 				// Segment is not full.
@@ -951,7 +952,7 @@ func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool)
 			}
 			dataSent = true
 			s.outstanding++
-			s.writeNext = nextSeg.segEntry.Next()
+			s.writeNext = nextSeg.Next()
 			continue
 		}
 
@@ -964,7 +965,6 @@ func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool)
 		// transmitted in (C.1)."
 		s.outstanding++
 		dataSent = true
-
 		s.sendSegment(nextSeg)
 
 		segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen())
@@ -1039,7 +1039,7 @@ func (s *sender) sendData() {
 	if s.fr.active && s.ep.sackPermitted {
 		dataSent = s.handleSACKRecovery(s.maxPayloadSize, end)
 	} else {
-		for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.segEntry.Next() {
+		for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() {
 			cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize
 			if cwndLimit < limit {
 				limit = cwndLimit
@@ -1047,7 +1047,7 @@ func (s *sender) sendData() {
 			if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
 				// Move writeNext along so that we don't try and scan data that
 				// has already been SACKED.
-				s.writeNext = seg.segEntry.Next()
+				s.writeNext = seg.Next()
 				continue
 			}
 			if sent := s.maybeSendSegment(seg, limit, end); !sent {
@@ -1055,7 +1055,7 @@ func (s *sender) sendData() {
 			}
 			dataSent = true
 			s.outstanding += s.pCount(seg)
-			s.writeNext = seg.segEntry.Next()
+			s.writeNext = seg.Next()
 		}
 	}
 
@@ -1186,7 +1186,7 @@ func (s *sender) SetPipe() {
 	}
 	pipe := 0
 	smss := seqnum.Size(s.ep.scoreboard.SMSS())
-	for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.segEntry.Next() {
+	for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() {
 		// With GSO each segment can be much larger than SMSS. So check the segment
 		// in SMSS sized ranges.
 		segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.data.Size()))
@@ -1278,6 +1278,39 @@ func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) {
 	return true
 }
 
+// Iterate the writeList and update RACK for each segment which is newly acked
+// either cumulatively or selectively. Loop through the segments which are
+// sacked, and update the RACK related variables and check for reordering.
+//
+// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
+// steps 2 and 3.
+func (s *sender) walkSACK(rcvdSeg *segment) {
+	// Sort the SACK blocks. The first block is the most recent unacked
+	// block. The following blocks can be in arbitrary order.
+	sackBlocks := make([]header.SACKBlock, len(rcvdSeg.parsedOptions.SACKBlocks))
+	copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks)
+	sort.Slice(sackBlocks, func(i, j int) bool {
+		return sackBlocks[j].Start.LessThan(sackBlocks[i].Start)
+	})
+
+	seg := s.writeList.Front()
+	for _, sb := range sackBlocks {
+		// This check excludes DSACK blocks.
+		if sb.Start.LessThanEq(rcvdSeg.ackNumber) || sb.Start.LessThanEq(s.sndUna) || s.sndNxt.LessThan(sb.End) {
+			continue
+		}
+
+		for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 {
+			if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked {
+				s.rc.update(seg, rcvdSeg, s.ep.tsOffset)
+				s.rc.detectReorder(seg)
+				seg.acked = true
+			}
+			seg = seg.Next()
+		}
+	}
+}
+
 // handleRcvdSegment is called when a segment is received; it is responsible for
 // updating the send-related state.
 func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
@@ -1312,6 +1345,21 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 				rcvdSeg.hasNewSACKInfo = true
 			}
 		}
+
+		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08
+		// section-7.2
+		// * Step 2: Update RACK stats.
+		//   If the ACK is not ignored as invalid, update the RACK.rtt
+		//   to be the RTT sample calculated using this ACK, and
+		//   continue.  If this ACK or SACK was for the most recently
+		//   sent packet, then record the RACK.xmit_ts timestamp and
+		//   RACK.end_seq sequence implied by this ACK.
+		// * Step 3: Detect packet reordering.
+		//   If the ACK selectively or cumulatively acknowledges an
+		//   unacknowledged and also never retransmitted sequence below
+		//   RACK.fack, then the corresponding packet has been
+		//   reordered and RACK.reord is set to TRUE.
+		s.walkSACK(rcvdSeg)
 		s.SetPipe()
 	}
 
@@ -1369,9 +1417,6 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 
 		ackLeft := acked
 		originalOutstanding := s.outstanding
-		s.rtt.Lock()
-		srtt := s.rtt.srtt
-		s.rtt.Unlock()
 		for ackLeft > 0 {
 			// We use logicalLen here because we can have FIN
 			// segments (which are always at the end of list) that
@@ -1388,18 +1433,18 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 			}
 
 			if s.writeNext == seg {
-				s.writeNext = seg.segEntry.Next()
+				s.writeNext = seg.Next()
 			}
 
 			// Update the RACK fields if SACK is enabled.
-			if s.ep.sackPermitted {
-				s.rc.Update(seg, rcvdSeg, srtt, s.ep.tsOffset)
+			if s.ep.sackPermitted && !seg.acked {
+				s.rc.update(seg, rcvdSeg, s.ep.tsOffset)
+				s.rc.detectReorder(seg)
 			}
 
 			s.writeList.Remove(seg)
-			s.rcList.Remove(seg)
 
-			// if SACK is enabled then Only reduce outstanding if
+			// If SACK is enabled then Only reduce outstanding if
 			// the segment was not previously SACKED as these have
 			// already been accounted for in SetPipe().
 			if !s.ep.sackPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
@@ -1465,12 +1510,6 @@ func (s *sender) sendSegment(seg *segment) *tcpip.Error {
 		if s.sndCwnd < s.sndSsthresh {
 			s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment()
 		}
-
-		// Move the segment which has to be retransmitted to the end of the list, as
-		// RACK requires the segments in the order of their transmission times.
-		// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2
-		// Step 5
-		s.rcList.PushBack(seg)
 	}
 	seg.xmitTime = time.Now()
 	seg.xmitCount++
diff --git a/pkg/tcpip/transport/tcp/tcp_rack_test.go b/pkg/tcpip/transport/tcp/tcp_rack_test.go
index e03f101e8..d3f92b48c 100644
--- a/pkg/tcpip/transport/tcp/tcp_rack_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_rack_test.go
@@ -21,17 +21,20 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context"
 )
 
+const (
+	maxPayload       = 10
+	tsOptionSize     = 12
+	maxTCPOptionSize = 40
+)
+
 // TestRACKUpdate tests the RACK related fields are updated when an ACK is
 // received on a SACK enabled connection.
 func TestRACKUpdate(t *testing.T) {
-	const maxPayload = 10
-	const tsOptionSize = 12
-	const maxTCPOptionSize = 40
-
 	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxTCPOptionSize+maxPayload))
 	defer c.Cleanup()
 
@@ -49,7 +52,7 @@ func TestRACKUpdate(t *testing.T) {
 		}
 
 		if state.Sender.RACKState.RTT == 0 {
-			t.Fatalf("RACK RTT failed to update when an ACK is received")
+			t.Fatalf("RACK RTT failed to update when an ACK is received, got RACKState.RTT == 0 want != 0")
 		}
 	})
 	setStackSACKPermitted(t, c, true)
@@ -69,6 +72,66 @@ func TestRACKUpdate(t *testing.T) {
 	bytesRead := 0
 	c.ReceiveAndCheckPacketWithOptions(data, bytesRead, maxPayload, tsOptionSize)
 	bytesRead += maxPayload
-	c.SendAck(790, bytesRead)
+	c.SendAck(seqnum.Value(context.TestInitialSequenceNumber).Add(1), bytesRead)
 	time.Sleep(200 * time.Millisecond)
 }
+
+// TestRACKDetectReorder tests that RACK detects packet reordering.
+func TestRACKDetectReorder(t *testing.T) {
+	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxTCPOptionSize+maxPayload))
+	defer c.Cleanup()
+
+	const ackNum = 2
+
+	var n int
+	ch := make(chan struct{})
+	c.Stack().AddTCPProbe(func(state stack.TCPEndpointState) {
+		gotSeq := state.Sender.RACKState.FACK
+		wantSeq := state.Sender.SndNxt
+		// FACK should be updated to the highest ending sequence number of the
+		// segment acknowledged most recently.
+		if !gotSeq.LessThanEq(wantSeq) || gotSeq.LessThan(wantSeq) {
+			t.Fatalf("RACK FACK failed to update, got: %v, but want: %v", gotSeq, wantSeq)
+		}
+
+		n++
+		if n < ackNum {
+			if state.Sender.RACKState.Reord {
+				t.Fatalf("RACK reorder detected when there is no reordering")
+			}
+			return
+		}
+
+		if state.Sender.RACKState.Reord == false {
+			t.Fatalf("RACK reorder detection failed")
+		}
+		close(ch)
+	})
+	setStackSACKPermitted(t, c, true)
+	createConnectedWithSACKAndTS(c)
+	data := buffer.NewView(ackNum * maxPayload)
+	for i := range data {
+		data[i] = byte(i)
+	}
+
+	// Write the data.
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	bytesRead := 0
+	for i := 0; i < ackNum; i++ {
+		c.ReceiveAndCheckPacketWithOptions(data, bytesRead, maxPayload, tsOptionSize)
+		bytesRead += maxPayload
+	}
+
+	start := c.IRS.Add(maxPayload + 1)
+	end := start.Add(maxPayload)
+	seq := seqnum.Value(context.TestInitialSequenceNumber).Add(1)
+	c.SendAckWithSACK(seq, 0, []header.SACKBlock{{start, end}})
+	c.SendAck(seq, bytesRead)
+
+	// Wait for the probe function to finish processing the ACK before the
+	// test completes.
+	<-ch
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go
index 99521f0c1..ef7f5719f 100644
--- a/pkg/tcpip/transport/tcp/tcp_sack_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go
@@ -46,8 +46,9 @@ func createConnectedWithSACKAndTS(c *context.Context) *context.RawEndpoint {
 
 func setStackSACKPermitted(t *testing.T, c *context.Context, enable bool) {
 	t.Helper()
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enable)); err != nil {
-		t.Fatalf("c.s.SetTransportProtocolOption(tcp.ProtocolNumber, SACKEnabled(%t) = %s", enable, err)
+	opt := tcpip.TCPSACKEnabled(enable)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("c.s.SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 }
 
@@ -162,8 +163,9 @@ func TestSackPermittedAccept(t *testing.T) {
 						// Set the SynRcvd threshold to
 						// zero to force a syn cookie
 						// based accept to happen.
-						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
-							t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+						var opt tcpip.TCPSynRcvdCountThresholdOption
+						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+							t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 						}
 					}
 					setStackSACKPermitted(t, c, sackEnabled)
@@ -236,8 +238,9 @@ func TestSackDisabledAccept(t *testing.T) {
 						// Set the SynRcvd threshold to
 						// zero to force a syn cookie
 						// based accept to happen.
-						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
-							t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+						var opt tcpip.TCPSynRcvdCountThresholdOption
+						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+							t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 						}
 					}
 
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 55ae09a2f..5f05608e2 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -21,6 +21,7 @@ import (
 	"testing"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -74,8 +75,8 @@ func TestGiveUpConnect(t *testing.T) {
 
 	// Wait for ep to become writable.
 	<-notifyCh
-	if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != tcpip.ErrAborted {
-		t.Fatalf("got ep.GetSockOpt(tcpip.ErrorOption{}) = %s, want = %s", err, tcpip.ErrAborted)
+	if err := ep.LastError(); err != tcpip.ErrAborted {
+		t.Fatalf("got ep.LastError() = %s, want = %s", err, tcpip.ErrAborted)
 	}
 
 	// Call Connect again to retreive the handshake failure status
@@ -240,6 +241,38 @@ func TestTCPResetsSentIncrement(t *testing.T) {
 	}
 }
 
+// TestTCPResetsSentNoICMP confirms that we don't get an ICMP
+// DstUnreachable packet when we try send a packet which is not part
+// of an active session.
+func TestTCPResetsSentNoICMP(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	stats := c.Stack().Stats()
+
+	// Send a SYN request for a closed port. This should elicit an RST
+	// but NOT an ICMPv4 DstUnreachable packet.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	// Receive whatever comes back.
+	b := c.GetPacket()
+	ipHdr := header.IPv4(b)
+	if got, want := ipHdr.Protocol(), uint8(header.TCPProtocolNumber); got != want {
+		t.Errorf("unexpected protocol, got = %d, want = %d", got, want)
+	}
+
+	// Read outgoing ICMP stats and check no ICMP DstUnreachable was recorded.
+	sent := stats.ICMP.V4PacketsSent
+	if got, want := sent.DstUnreachable.Value(), uint64(0); got != want {
+		t.Errorf("got ICMP DstUnreachable.Value() = %d, want = %d", got, want)
+	}
+}
+
 // TestTCPResetSentForACKWhenNotUsingSynCookies checks that the stack generates
 // a RST if an ACK is received on the listening socket for which there is no
 // active handshake in progress and we are not using SYN cookies.
@@ -291,12 +324,12 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -309,16 +342,16 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 	// Lower stackwide TIME_WAIT timeout so that the reservations
 	// are released instantly on Close.
 	tcpTW := tcpip.TCPTimeWaitTimeoutOption(1 * time.Millisecond)
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpTW); err != nil {
-		t.Fatalf("e.stack.SetTransportProtocolOption(%d, %#v) = %s", tcp.ProtocolNumber, tcpTW, err)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &tcpTW); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, tcpTW, tcpTW, err)
 	}
 
 	c.EP.Close()
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+1)),
-		checker.AckNum(uint32(iss)+1),
+		checker.TCPSeqNum(uint32(c.IRS+1)),
+		checker.TCPAckNum(uint32(iss)+1),
 		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
 	finHeaders := &context.Headers{
 		SrcPort: context.TestPort,
@@ -348,8 +381,8 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+1)),
-		checker.AckNum(0),
+		checker.TCPSeqNum(uint32(c.IRS+1)),
+		checker.TCPAckNum(0),
 		checker.TCPFlags(header.TCPFlagRst)))
 }
 
@@ -432,8 +465,9 @@ func TestConnectResetAfterClose(t *testing.T) {
 	// Set TCPLinger to 3 seconds so that sockets are marked closed
 	// after 3 second in FIN_WAIT2 state.
 	tcpLingerTimeout := 3 * time.Second
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPLingerTimeoutOption(tcpLingerTimeout)); err != nil {
-		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%s) failed: %s", tcpLingerTimeout, err)
+	opt := tcpip.TCPLingerTimeoutOption(tcpLingerTimeout)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
@@ -446,8 +480,8 @@ func TestConnectResetAfterClose(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
 		),
 	)
@@ -488,8 +522,8 @@ func TestConnectResetAfterClose(t *testing.T) {
 				// RST is always generated with sndNxt which if the FIN
 				// has been sent will be 1 higher than the sequence number
 				// of the FIN itself.
-				checker.SeqNum(uint32(c.IRS)+2),
-				checker.AckNum(0),
+				checker.TCPSeqNum(uint32(c.IRS)+2),
+				checker.TCPAckNum(0),
 				checker.TCPFlags(header.TCPFlagRst),
 			),
 		)
@@ -506,8 +540,9 @@ func TestCurrentConnectedIncrement(t *testing.T) {
 	// Set TCPTimeWaitTimeout to 1 seconds so that sockets are marked closed
 	// after 1 second in TIME_WAIT state.
 	tcpTimeWaitTimeout := 1 * time.Second
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
-		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeout(%d) failed: %s", tcpTimeWaitTimeout, err)
+	opt := tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
@@ -527,8 +562,8 @@ func TestCurrentConnectedIncrement(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
 		),
 	)
@@ -563,8 +598,8 @@ func TestCurrentConnectedIncrement(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+2),
-			checker.AckNum(791),
+			checker.TCPSeqNum(uint32(c.IRS)+2),
+			checker.TCPAckNum(791),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -610,8 +645,8 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(791),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(791),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -631,8 +666,8 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(791),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(791),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
 		),
 	)
@@ -691,8 +726,8 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+2),
-			checker.AckNum(0),
+			checker.TCPSeqNum(uint32(c.IRS)+2),
+			checker.TCPAckNum(0),
 			checker.TCPFlags(header.TCPFlagRst),
 		),
 	)
@@ -743,8 +778,8 @@ func TestSimpleReceive(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(790+len(data))),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(790+len(data))),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -933,8 +968,8 @@ func TestUserSuppliedMSSOnListenAccept(t *testing.T) {
 
 					// Set the SynRcvd threshold to force a syn cookie based accept to happen.
 					opt := tcpip.TCPSynRcvdCountThresholdOption(nonSynCookieAccepts)
-					if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, opt); err != nil {
-						t.Fatalf("SetTransportProtocolOption(%d, %#v): %s", tcp.ProtocolNumber, opt, err)
+					if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+						t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 					}
 
 					if err := c.EP.SetSockOptInt(tcpip.MaxSegOption, int(test.setMSS)); err != nil {
@@ -996,7 +1031,7 @@ func TestSendRstOnListenerRxSynAckV4(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagRst),
-		checker.SeqNum(200)))
+		checker.TCPSeqNum(200)))
 }
 
 func TestSendRstOnListenerRxSynAckV6(t *testing.T) {
@@ -1024,7 +1059,7 @@ func TestSendRstOnListenerRxSynAckV6(t *testing.T) {
 	checker.IPv6(t, c.GetV6Packet(), checker.TCP(
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagRst),
-		checker.SeqNum(200)))
+		checker.TCPSeqNum(200)))
 }
 
 // TestTCPAckBeforeAcceptV4 tests that once the 3-way handshake is complete,
@@ -1061,8 +1096,8 @@ func TestTCPAckBeforeAcceptV4(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagAck),
-		checker.SeqNum(uint32(iss+1)),
-		checker.AckNum(uint32(irs+5))))
+		checker.TCPSeqNum(uint32(iss+1)),
+		checker.TCPAckNum(uint32(irs+5))))
 }
 
 // TestTCPAckBeforeAcceptV6 tests that once the 3-way handshake is complete,
@@ -1099,8 +1134,8 @@ func TestTCPAckBeforeAcceptV6(t *testing.T) {
 	checker.IPv6(t, c.GetV6Packet(), checker.TCP(
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagAck),
-		checker.SeqNum(uint32(iss+1)),
-		checker.AckNum(uint32(irs+5))))
+		checker.TCPSeqNum(uint32(iss+1)),
+		checker.TCPAckNum(uint32(irs+5))))
 }
 
 func TestSendRstOnListenerRxAckV4(t *testing.T) {
@@ -1128,7 +1163,7 @@ func TestSendRstOnListenerRxAckV4(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagRst),
-		checker.SeqNum(200)))
+		checker.TCPSeqNum(200)))
 }
 
 func TestSendRstOnListenerRxAckV6(t *testing.T) {
@@ -1156,7 +1191,7 @@ func TestSendRstOnListenerRxAckV6(t *testing.T) {
 	checker.IPv6(t, c.GetV6Packet(), checker.TCP(
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagRst),
-		checker.SeqNum(200)))
+		checker.TCPSeqNum(200)))
 }
 
 // TestListenShutdown tests for the listening endpoint replying with RST
@@ -1272,8 +1307,8 @@ func TestTOSV4(t *testing.T) {
 		checker.PayloadLen(len(data)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790), // Acknum is initial sequence number + 1
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790), // Acknum is initial sequence number + 1
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 		checker.TOS(tos, 0),
@@ -1321,8 +1356,8 @@ func TestTrafficClassV6(t *testing.T) {
 		checker.PayloadLen(len(data)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 		checker.TOS(tos, 0),
@@ -1349,7 +1384,9 @@ func TestConnectBindToDevice(t *testing.T) {
 
 			c.Create(-1)
 			bindToDevice := tcpip.BindToDeviceOption(test.device)
-			c.EP.SetSockOpt(bindToDevice)
+			if err := c.EP.SetSockOpt(&bindToDevice); err != nil {
+				t.Fatalf("c.EP.SetSockOpt(&%T(%d)): %s", bindToDevice, bindToDevice, err)
+			}
 			// Start connection attempt.
 			waitEntry, _ := waiter.NewChannelEntry(nil)
 			c.WQ.EventRegister(&waitEntry, waiter.EventOut)
@@ -1510,8 +1547,8 @@ func TestOutOfOrderReceive(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -1561,8 +1598,8 @@ func TestOutOfOrderReceive(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(790+len(data))),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(790+len(data))),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -1572,8 +1609,8 @@ func TestOutOfOrderFlood(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	// Create a new connection with initial window size of 10.
-	c.CreateConnected(789, 30000, 10)
+	rcvBufSz := math.MaxUint16
+	c.CreateConnected(789, 30000, rcvBufSz)
 
 	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
 		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
@@ -1594,8 +1631,8 @@ func TestOutOfOrderFlood(t *testing.T) {
 		checker.IPv4(t, c.GetPacket(),
 			checker.TCP(
 				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(c.IRS)+1),
-				checker.AckNum(790),
+				checker.TCPSeqNum(uint32(c.IRS)+1),
+				checker.TCPAckNum(790),
 				checker.TCPFlags(header.TCPFlagAck),
 			),
 		)
@@ -1615,8 +1652,8 @@ func TestOutOfOrderFlood(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -1635,8 +1672,8 @@ func TestOutOfOrderFlood(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(793),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(793),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -1677,8 +1714,8 @@ func TestRstOnCloseWithUnreadData(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(790+len(data))),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(790+len(data))),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -1692,7 +1729,7 @@ func TestRstOnCloseWithUnreadData(t *testing.T) {
 			checker.DstPort(context.TestPort),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
 			// We shouldn't consume a sequence number on RST.
-			checker.SeqNum(uint32(c.IRS)+1),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
 		))
 	// The RST puts the endpoint into an error state.
 	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
@@ -1746,8 +1783,8 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(790+len(data))),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(790+len(data))),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -1760,7 +1797,7 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
 		checker.TCP(
 			checker.DstPort(context.TestPort),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
-			checker.SeqNum(uint32(c.IRS)+1),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
 		))
 
 	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateFinWait1; got != want {
@@ -1779,7 +1816,7 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
 			// RST is always generated with sndNxt which if the FIN
 			// has been sent will be 1 higher than the sequence
 			// number of the FIN itself.
-			checker.SeqNum(uint32(c.IRS)+2),
+			checker.TCPSeqNum(uint32(c.IRS)+2),
 		))
 	// The RST puts the endpoint into an error state.
 	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
@@ -1825,7 +1862,8 @@ func TestFullWindowReceive(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 30000, 10)
+	const rcvBufSz = 10
+	c.CreateConnected(789, 30000, rcvBufSz)
 
 	we, ch := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&we, waiter.EventIn)
@@ -1836,8 +1874,13 @@ func TestFullWindowReceive(t *testing.T) {
 		t.Fatalf("Read failed: %s", err)
 	}
 
-	// Fill up the window.
-	data := []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+	// Fill up the window w/ tcp.SegOverheadFactor*rcvBufSz as netstack multiplies
+	// the provided buffer value by tcp.SegOverheadFactor to calculate the actual
+	// receive buffer size.
+	data := make([]byte, tcp.SegOverheadFactor*rcvBufSz)
+	for i := range data {
+		data[i] = byte(i % 255)
+	}
 	c.SendPacket(data, &context.Headers{
 		SrcPort: context.TestPort,
 		DstPort: c.Port,
@@ -1858,10 +1901,10 @@ func TestFullWindowReceive(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(790+len(data))),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(790+len(data))),
 			checker.TCPFlags(header.TCPFlagAck),
-			checker.Window(0),
+			checker.TCPWindow(0),
 		),
 	)
 
@@ -1884,10 +1927,10 @@ func TestFullWindowReceive(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(790+len(data))),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(790+len(data))),
 			checker.TCPFlags(header.TCPFlagAck),
-			checker.Window(10),
+			checker.TCPWindow(10),
 		),
 	)
 }
@@ -1896,12 +1939,15 @@ func TestNoWindowShrinking(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	// Start off with a window size of 10, then shrink it to 5.
-	c.CreateConnected(789, 30000, 10)
-
-	if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 5); err != nil {
-		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 5) failed: %s", err)
-	}
+	// Start off with a certain receive buffer then cut it in half and verify that
+	// the right edge of the window does not shrink.
+	// NOTE: Netstack doubles the value specified here.
+	rcvBufSize := 65536
+	iss := seqnum.Value(789)
+	// Enable window scaling with a scale of zero from our end.
+	c.CreateConnectedWithRawOptions(iss, 30000, rcvBufSize, []byte{
+		header.TCPOptionWS, 3, 0, header.TCPOptionNOP,
+	})
 
 	we, ch := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&we, waiter.EventIn)
@@ -1910,14 +1956,15 @@ func TestNoWindowShrinking(t *testing.T) {
 	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
 		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
 	}
-
-	// Send 3 bytes, check that the peer acknowledges them.
-	data := []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
-	c.SendPacket(data[:3], &context.Headers{
+	// Send a 1 byte payload so that we can record the current receive window.
+	// Send a payload of half the size of rcvBufSize.
+	seqNum := iss.Add(1)
+	payload := []byte{1}
+	c.SendPacket(payload, &context.Headers{
 		SrcPort: context.TestPort,
 		DstPort: c.Port,
 		Flags:   header.TCPFlagAck,
-		SeqNum:  790,
+		SeqNum:  seqNum,
 		AckNum:  c.IRS.Add(1),
 		RcvWnd:  30000,
 	})
@@ -1929,46 +1976,93 @@ func TestNoWindowShrinking(t *testing.T) {
 		t.Fatalf("Timed out waiting for data to arrive")
 	}
 
-	// Check that data is acknowledged, and that window doesn't go to zero
-	// just yet because it was previously set to 10. It must go to 7 now.
-	checker.IPv4(t, c.GetPacket(),
+	// Read the 1 byte payload we just sent.
+	v, _, err := c.EP.Read(nil)
+	if err != nil {
+		t.Fatalf("Read failed: %s", err)
+	}
+	if got, want := payload, v; !bytes.Equal(got, want) {
+		t.Fatalf("got data: %v, want: %v", got, want)
+	}
+
+	seqNum = seqNum.Add(1)
+	// Verify that the ACK does not shrink the window.
+	pkt := c.GetPacket()
+	checker.IPv4(t, pkt,
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(793),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(seqNum)),
 			checker.TCPFlags(header.TCPFlagAck),
-			checker.Window(7),
 		),
 	)
+	// Stash the initial window.
+	initialWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize() << c.RcvdWindowScale
+	initialLastAcceptableSeq := seqNum.Add(seqnum.Size(initialWnd))
+	// Now shrink the receive buffer to half its original size.
+	if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufSize/2); err != nil {
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 5) failed: %s", err)
+	}
 
-	// Send 7 more bytes, check that the window fills up.
-	c.SendPacket(data[3:], &context.Headers{
+	data := generateRandomPayload(t, rcvBufSize)
+	// Send a payload of half the size of rcvBufSize.
+	c.SendPacket(data[:rcvBufSize/2], &context.Headers{
 		SrcPort: context.TestPort,
 		DstPort: c.Port,
 		Flags:   header.TCPFlagAck,
-		SeqNum:  793,
+		SeqNum:  seqNum,
 		AckNum:  c.IRS.Add(1),
 		RcvWnd:  30000,
 	})
+	seqNum = seqNum.Add(seqnum.Size(rcvBufSize / 2))
 
-	select {
-	case <-ch:
-	case <-time.After(5 * time.Second):
-		t.Fatalf("Timed out waiting for data to arrive")
+	// Verify that the ACK does not shrink the window.
+	pkt = c.GetPacket()
+	checker.IPv4(t, pkt,
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(seqNum)),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+	newWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize() << c.RcvdWindowScale
+	newLastAcceptableSeq := seqNum.Add(seqnum.Size(newWnd))
+	if newLastAcceptableSeq.LessThan(initialLastAcceptableSeq) {
+		t.Fatalf("receive window shrunk unexpectedly got: %d, want >= %d", newLastAcceptableSeq, initialLastAcceptableSeq)
 	}
 
+	// Send another payload of half the size of rcvBufSize. This should fill up the
+	// socket receive buffer and we should see a zero window.
+	c.SendPacket(data[rcvBufSize/2:], &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  seqNum,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+	seqNum = seqNum.Add(seqnum.Size(rcvBufSize / 2))
+
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(790+len(data))),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(seqNum)),
 			checker.TCPFlags(header.TCPFlagAck),
-			checker.Window(0),
+			checker.TCPWindow(0),
 		),
 	)
 
+	// Wait for receive to be notified.
+	select {
+	case <-ch:
+	case <-time.After(5 * time.Second):
+		t.Fatalf("Timed out waiting for data to arrive")
+	}
+
 	// Receive data and check it.
-	read := make([]byte, 0, 10)
+	read := make([]byte, 0, rcvBufSize)
 	for len(read) < len(data) {
 		v, _, err := c.EP.Read(nil)
 		if err != nil {
@@ -1982,15 +2076,15 @@ func TestNoWindowShrinking(t *testing.T) {
 		t.Fatalf("got data = %v, want = %v", read, data)
 	}
 
-	// Check that we get an ACK for the newly non-zero window, which is the
-	// new size.
+	// Check that we get an ACK for the newly non-zero window, which is the new
+	// receive buffer size we set after the connection was established.
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(790+len(data))),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(seqNum)),
 			checker.TCPFlags(header.TCPFlagAck),
-			checker.Window(5),
+			checker.TCPWindow(uint16(rcvBufSize/2)>>c.RcvdWindowScale),
 		),
 	)
 }
@@ -2015,8 +2109,8 @@ func TestSimpleSend(t *testing.T) {
 		checker.PayloadLen(len(data)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -2057,8 +2151,8 @@ func TestZeroWindowSend(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -2079,8 +2173,8 @@ func TestZeroWindowSend(t *testing.T) {
 		checker.PayloadLen(len(data)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -2119,16 +2213,16 @@ func TestScaledWindowConnect(t *testing.T) {
 		t.Fatalf("Write failed: %s", err)
 	}
 
-	// Check that data is received, and that advertised window is 0xbfff,
+	// Check that data is received, and that advertised window is 0x5fff,
 	// that is, that it is scaled.
 	b := c.GetPacket()
 	checker.IPv4(t, b,
 		checker.PayloadLen(len(data)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
-			checker.Window(0xbfff),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
+			checker.TCPWindow(0x5fff),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -2158,9 +2252,9 @@ func TestNonScaledWindowConnect(t *testing.T) {
 		checker.PayloadLen(len(data)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
-			checker.Window(0xffff),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
+			checker.TCPWindow(0xffff),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -2194,19 +2288,20 @@ func TestScaledWindowAccept(t *testing.T) {
 	}
 
 	// Do 3-way handshake.
-	c.PassiveConnectWithOptions(100, 2, header.TCPSynOptions{MSS: defaultIPv4MSS})
+	// wndScale expected is 3 as 65535 * 3 * 2 < 65535 * 2^3 but > 65535 *2 *2
+	c.PassiveConnectWithOptions(100, 3 /* wndScale */, header.TCPSynOptions{MSS: defaultIPv4MSS})
 
 	// Try to accept the connection.
 	we, ch := waiter.NewChannelEntry(nil)
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -2224,16 +2319,16 @@ func TestScaledWindowAccept(t *testing.T) {
 		t.Fatalf("Write failed: %s", err)
 	}
 
-	// Check that data is received, and that advertised window is 0xbfff,
+	// Check that data is received, and that advertised window is 0x5fff,
 	// that is, that it is scaled.
 	b := c.GetPacket()
 	checker.IPv4(t, b,
 		checker.PayloadLen(len(data)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
-			checker.Window(0xbfff),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
+			checker.TCPWindow(0x5fff),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -2275,12 +2370,12 @@ func TestNonScaledWindowAccept(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -2305,9 +2400,9 @@ func TestNonScaledWindowAccept(t *testing.T) {
 		checker.PayloadLen(len(data)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
-			checker.Window(0xffff),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
+			checker.TCPWindow(0xffff),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -2320,18 +2415,19 @@ func TestZeroScaledWindowReceive(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	// Set the window size such that a window scale of 4 will be used.
-	const wnd = 65535 * 10
-	const ws = uint32(4)
-	c.CreateConnectedWithRawOptions(789, 30000, wnd, []byte{
+	// Set the buffer size such that a window scale of 5 will be used.
+	const bufSz = 65535 * 10
+	const ws = uint32(5)
+	c.CreateConnectedWithRawOptions(789, 30000, bufSz, []byte{
 		header.TCPOptionWS, 3, 0, header.TCPOptionNOP,
 	})
 
 	// Write chunks of 50000 bytes.
-	remain := wnd
+	remain := 0
 	sent := 0
 	data := make([]byte, 50000)
-	for remain > len(data) {
+	// Keep writing till the window drops below len(data).
+	for {
 		c.SendPacket(data, &context.Headers{
 			SrcPort: context.TestPort,
 			DstPort: c.Port,
@@ -2341,21 +2437,25 @@ func TestZeroScaledWindowReceive(t *testing.T) {
 			RcvWnd:  30000,
 		})
 		sent += len(data)
-		remain -= len(data)
-		checker.IPv4(t, c.GetPacket(),
+		pkt := c.GetPacket()
+		checker.IPv4(t, pkt,
 			checker.PayloadLen(header.TCPMinimumSize),
 			checker.TCP(
 				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(c.IRS)+1),
-				checker.AckNum(uint32(790+sent)),
-				checker.Window(uint16(remain>>ws)),
+				checker.TCPSeqNum(uint32(c.IRS)+1),
+				checker.TCPAckNum(uint32(790+sent)),
 				checker.TCPFlags(header.TCPFlagAck),
 			),
 		)
+		// Don't reduce window to zero here.
+		if wnd := int(header.TCP(header.IPv4(pkt).Payload()).WindowSize()); wnd<<ws < len(data) {
+			remain = wnd << ws
+			break
+		}
 	}
 
 	// Make the window non-zero, but the scaled window zero.
-	if remain >= 16 {
+	for remain >= 16 {
 		data = data[:remain-15]
 		c.SendPacket(data, &context.Headers{
 			SrcPort: context.TestPort,
@@ -2366,22 +2466,35 @@ func TestZeroScaledWindowReceive(t *testing.T) {
 			RcvWnd:  30000,
 		})
 		sent += len(data)
-		remain -= len(data)
-		checker.IPv4(t, c.GetPacket(),
+		pkt := c.GetPacket()
+		checker.IPv4(t, pkt,
 			checker.PayloadLen(header.TCPMinimumSize),
 			checker.TCP(
 				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(c.IRS)+1),
-				checker.AckNum(uint32(790+sent)),
-				checker.Window(0),
+				checker.TCPSeqNum(uint32(c.IRS)+1),
+				checker.TCPAckNum(uint32(790+sent)),
 				checker.TCPFlags(header.TCPFlagAck),
 			),
 		)
+		// Since the receive buffer is split between window advertisement and
+		// application data buffer the window does not always reflect the space
+		// available and actual space available can be a bit more than what is
+		// advertised in the window.
+		wnd := int(header.TCP(header.IPv4(pkt).Payload()).WindowSize())
+		if wnd == 0 {
+			break
+		}
+		remain = wnd << ws
 	}
 
-	// Read at least 1MSS of data. An ack should be sent in response to that.
+	// Read at least 2MSS of data. An ack should be sent in response to that.
+	// Since buffer space is now split in half between window and application
+	// data we need to read more than 1 MSS(65536) of data for a non-zero window
+	// update to be sent. For 1MSS worth of window to be available we need to
+	// read at least 128KB. Since our segments above were 50KB each it means
+	// we need to read at 3 packets.
 	sz := 0
-	for sz < defaultMTU {
+	for sz < defaultMTU*2 {
 		v, _, err := c.EP.Read(nil)
 		if err != nil {
 			t.Fatalf("Read failed: %s", err)
@@ -2393,9 +2506,9 @@ func TestZeroScaledWindowReceive(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(790+sent)),
-			checker.Window(uint16(sz>>ws)),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(790+sent)),
+			checker.TCPWindowGreaterThanEq(uint16(defaultMTU>>ws)),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -2462,8 +2575,8 @@ func TestSegmentMerging(t *testing.T) {
 					checker.PayloadLen(header.TCPMinimumSize+1),
 					checker.TCP(
 						checker.DstPort(context.TestPort),
-						checker.SeqNum(uint32(c.IRS)+uint32(i)+1),
-						checker.AckNum(790),
+						checker.TCPSeqNum(uint32(c.IRS)+uint32(i)+1),
+						checker.TCPAckNum(790),
 						checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 					),
 				)
@@ -2485,8 +2598,8 @@ func TestSegmentMerging(t *testing.T) {
 				checker.PayloadLen(len(allData)+header.TCPMinimumSize),
 				checker.TCP(
 					checker.DstPort(context.TestPort),
-					checker.SeqNum(uint32(c.IRS)+11),
-					checker.AckNum(790),
+					checker.TCPSeqNum(uint32(c.IRS)+11),
+					checker.TCPAckNum(790),
 					checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 				),
 			)
@@ -2533,8 +2646,8 @@ func TestDelay(t *testing.T) {
 			checker.PayloadLen(len(want)+header.TCPMinimumSize),
 			checker.TCP(
 				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(seq)),
-				checker.AckNum(790),
+				checker.TCPSeqNum(uint32(seq)),
+				checker.TCPAckNum(790),
 				checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 			),
 		)
@@ -2580,8 +2693,8 @@ func TestUndelay(t *testing.T) {
 		checker.PayloadLen(len(allData[0])+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(seq)),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(seq)),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -2603,8 +2716,8 @@ func TestUndelay(t *testing.T) {
 		checker.PayloadLen(len(allData[1])+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(seq)),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(seq)),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -2665,8 +2778,8 @@ func TestMSSNotDelayed(t *testing.T) {
 					checker.PayloadLen(len(data)+header.TCPMinimumSize),
 					checker.TCP(
 						checker.DstPort(context.TestPort),
-						checker.SeqNum(uint32(seq)),
-						checker.AckNum(790),
+						checker.TCPSeqNum(uint32(seq)),
+						checker.TCPAckNum(790),
 						checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 					),
 				)
@@ -2717,8 +2830,8 @@ func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) {
 		checker.IPv4(t, b,
 			checker.TCP(
 				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(c.IRS)+1+uint32(bytesReceived)),
-				checker.AckNum(790),
+				checker.TCPSeqNum(uint32(c.IRS)+1+uint32(bytesReceived)),
+				checker.TCPAckNum(790),
 				checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 			),
 		)
@@ -2838,12 +2951,12 @@ func TestPassiveSendMSSLessThanMTU(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -2865,8 +2978,9 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) {
 
 	// Set the SynRcvd threshold to zero to force a syn cookie based accept
 	// to happen.
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
-		t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+	opt := tcpip.TCPSynRcvdCountThresholdOption(0)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	// Create EP and start listening.
@@ -2893,12 +3007,12 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -2959,7 +3073,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
 	// Set the buffer size to a deterministic size so that we can check the
 	// window scaling option.
 	const rcvBufferSize = 0x20000
-	const wndScale = 2
+	const wndScale = 3
 	if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufferSize); err != nil {
 		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, %d) failed failed: %s", rcvBufferSize, err)
 	}
@@ -2994,7 +3108,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
 			checker.DstPort(context.TestPort),
 			checker.TCPFlags(header.TCPFlagSyn),
 			checker.SrcPort(tcpHdr.SourcePort()),
-			checker.SeqNum(tcpHdr.SequenceNumber()),
+			checker.TCPSeqNum(tcpHdr.SequenceNumber()),
 			checker.TCPSynOptions(header.TCPSynOptions{MSS: mss, WS: wndScale}),
 		),
 	)
@@ -3015,16 +3129,16 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
 		checker.TCP(
 			checker.DstPort(context.TestPort),
 			checker.TCPFlags(header.TCPFlagAck),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(iss)+1),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(iss)+1),
 		),
 	)
 
 	// Wait for connection to be established.
 	select {
 	case <-ch:
-		if err := c.EP.GetSockOpt(tcpip.ErrorOption{}); err != nil {
-			t.Fatalf("GetSockOpt failed: %s", err)
+		if err := c.EP.LastError(); err != nil {
+			t.Fatalf("Connect failed: %s", err)
 		}
 	case <-time.After(1 * time.Second):
 		t.Fatalf("Timed out waiting for connection")
@@ -3144,8 +3258,9 @@ func TestMaxRetransmitsTimeout(t *testing.T) {
 	defer c.Cleanup()
 
 	const numRetries = 2
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRetriesOption(numRetries)); err != nil {
-		t.Fatalf("could not set protocol option MaxRetries.\n")
+	opt := tcpip.TCPMaxRetriesOption(numRetries)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */)
@@ -3204,8 +3319,9 @@ func TestMaxRTO(t *testing.T) {
 	defer c.Cleanup()
 
 	rto := 1 * time.Second
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRTOOption(rto)); err != nil {
-		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPMaxRTO(%d) failed: %s", rto, err)
+	opt := tcpip.TCPMaxRTOOption(rto)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */)
@@ -3307,8 +3423,8 @@ func TestFinImmediately(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
 		),
 	)
@@ -3328,8 +3444,8 @@ func TestFinImmediately(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+2),
-			checker.AckNum(791),
+			checker.TCPSeqNum(uint32(c.IRS)+2),
+			checker.TCPAckNum(791),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -3350,8 +3466,8 @@ func TestFinRetransmit(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
 		),
 	)
@@ -3361,8 +3477,8 @@ func TestFinRetransmit(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
 		),
 	)
@@ -3382,8 +3498,8 @@ func TestFinRetransmit(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+2),
-			checker.AckNum(791),
+			checker.TCPSeqNum(uint32(c.IRS)+2),
+			checker.TCPAckNum(791),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -3406,8 +3522,8 @@ func TestFinWithNoPendingData(t *testing.T) {
 		checker.PayloadLen(len(view)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(790),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -3431,8 +3547,8 @@ func TestFinWithNoPendingData(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(790),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(790),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
 		),
 	)
@@ -3453,8 +3569,8 @@ func TestFinWithNoPendingData(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(791),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(791),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -3481,8 +3597,8 @@ func TestFinWithPendingDataCwndFull(t *testing.T) {
 			checker.PayloadLen(len(view)+header.TCPMinimumSize),
 			checker.TCP(
 				checker.DstPort(context.TestPort),
-				checker.SeqNum(next),
-				checker.AckNum(790),
+				checker.TCPSeqNum(next),
+				checker.TCPAckNum(790),
 				checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 			),
 		)
@@ -3500,8 +3616,8 @@ func TestFinWithPendingDataCwndFull(t *testing.T) {
 		checker.PayloadLen(len(view)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -3520,8 +3636,8 @@ func TestFinWithPendingDataCwndFull(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(790),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(790),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
 		),
 	)
@@ -3541,8 +3657,8 @@ func TestFinWithPendingDataCwndFull(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(791),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(791),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -3565,8 +3681,8 @@ func TestFinWithPendingData(t *testing.T) {
 		checker.PayloadLen(len(view)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(790),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -3590,8 +3706,8 @@ func TestFinWithPendingData(t *testing.T) {
 		checker.PayloadLen(len(view)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(790),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -3606,8 +3722,8 @@ func TestFinWithPendingData(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(790),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(790),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
 		),
 	)
@@ -3627,8 +3743,8 @@ func TestFinWithPendingData(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(791),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(791),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -3652,8 +3768,8 @@ func TestFinWithPartialAck(t *testing.T) {
 		checker.PayloadLen(len(view)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(790),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -3673,8 +3789,8 @@ func TestFinWithPartialAck(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(791),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(791),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -3688,8 +3804,8 @@ func TestFinWithPartialAck(t *testing.T) {
 		checker.PayloadLen(len(view)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(791),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(791),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -3704,8 +3820,8 @@ func TestFinWithPartialAck(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(791),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(791),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
 		),
 	)
@@ -3796,8 +3912,8 @@ func scaledSendWindow(t *testing.T, scale uint8) {
 		checker.PayloadLen((1<<scale)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -3935,7 +4051,7 @@ func TestReceivedSegmentQueuing(t *testing.T) {
 		checker.IPv4(t, b,
 			checker.TCP(
 				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(c.IRS)+1),
+				checker.TCPSeqNum(uint32(c.IRS)+1),
 				checker.TCPFlags(header.TCPFlagAck),
 			),
 		)
@@ -3962,8 +4078,9 @@ func TestReadAfterClosedState(t *testing.T) {
 	// Set TCPTimeWaitTimeout to 1 seconds so that sockets are marked closed
 	// after 1 second in TIME_WAIT state.
 	tcpTimeWaitTimeout := 1 * time.Second
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
-		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeout(%d) failed: %s", tcpTimeWaitTimeout, err)
+	opt := tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
@@ -3985,8 +4102,8 @@ func TestReadAfterClosedState(t *testing.T) {
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
 		),
 	)
@@ -4010,8 +4127,8 @@ func TestReadAfterClosedState(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+2),
-			checker.AckNum(uint32(791+len(data))),
+			checker.TCPSeqNum(uint32(c.IRS)+2),
+			checker.TCPAckNum(uint32(791+len(data))),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -4183,8 +4300,8 @@ func checkSendBufferSize(t *testing.T, ep tcpip.Endpoint, v int) {
 
 func TestDefaultBufferSizes(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol},
 	})
 
 	// Check the default values.
@@ -4202,11 +4319,15 @@ func TestDefaultBufferSizes(t *testing.T) {
 	checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize)
 
 	// Change the default send buffer size.
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{
-		Min:     1,
-		Default: tcp.DefaultSendBufferSize * 2,
-		Max:     tcp.DefaultSendBufferSize * 20}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPSendBufferSizeRangeOption{
+			Min:     1,
+			Default: tcp.DefaultSendBufferSize * 2,
+			Max:     tcp.DefaultSendBufferSize * 20,
+		}
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err)
+		}
 	}
 
 	ep.Close()
@@ -4219,11 +4340,15 @@ func TestDefaultBufferSizes(t *testing.T) {
 	checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize)
 
 	// Change the default receive buffer size.
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{
-		Min:     1,
-		Default: tcp.DefaultReceiveBufferSize * 3,
-		Max:     tcp.DefaultReceiveBufferSize * 30}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %v", err)
+	{
+		opt := tcpip.TCPReceiveBufferSizeRangeOption{
+			Min:     1,
+			Default: tcp.DefaultReceiveBufferSize * 3,
+			Max:     tcp.DefaultReceiveBufferSize * 30,
+		}
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err)
+		}
 	}
 
 	ep.Close()
@@ -4238,8 +4363,8 @@ func TestDefaultBufferSizes(t *testing.T) {
 
 func TestMinMaxBufferSizes(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol},
 	})
 
 	// Check the default values.
@@ -4250,22 +4375,28 @@ func TestMinMaxBufferSizes(t *testing.T) {
 	defer ep.Close()
 
 	// Change the min/max values for send/receive
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 200, Default: tcp.DefaultReceiveBufferSize * 2, Max: tcp.DefaultReceiveBufferSize * 20}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 200, Default: tcp.DefaultReceiveBufferSize * 2, Max: tcp.DefaultReceiveBufferSize * 20}
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err)
+		}
 	}
 
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{Min: 300, Default: tcp.DefaultSendBufferSize * 3, Max: tcp.DefaultSendBufferSize * 30}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPSendBufferSizeRangeOption{Min: 300, Default: tcp.DefaultSendBufferSize * 3, Max: tcp.DefaultSendBufferSize * 30}
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err)
+		}
 	}
 
-	// Set values below the min.
-	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 199); err != nil {
+	// Set values below the min/2.
+	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 99); err != nil {
 		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 199) failed: %s", err)
 	}
 
 	checkRecvBufferSize(t, ep, 200)
 
-	if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 299); err != nil {
+	if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 149); err != nil {
 		t.Fatalf("SetSockOptInt(SendBufferSizeOption, 299) failed: %s", err)
 	}
 
@@ -4276,19 +4407,21 @@ func TestMinMaxBufferSizes(t *testing.T) {
 		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption) failed: %s", err)
 	}
 
-	checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*20)
+	// Values above max are capped at max and then doubled.
+	checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*20*2)
 
 	if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 1+tcp.DefaultSendBufferSize*30); err != nil {
 		t.Fatalf("SetSockOptInt(SendBufferSizeOption) failed: %s", err)
 	}
 
-	checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*30)
+	// Values above max are capped at max and then doubled.
+	checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*30*2)
 }
 
 func TestBindToDeviceOption(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()}})
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol}})
 
 	ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
 	if err != nil {
@@ -4321,16 +4454,15 @@ func TestBindToDeviceOption(t *testing.T) {
 		t.Run(testAction.name, func(t *testing.T) {
 			if testAction.setBindToDevice != nil {
 				bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice)
-				if gotErr, wantErr := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr {
-					t.Errorf("SetSockOpt(%#v) got %v, want %v", bindToDevice, gotErr, wantErr)
+				if gotErr, wantErr := ep.SetSockOpt(&bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr {
+					t.Errorf("got SetSockOpt(&%T(%d)) = %s, want = %s", bindToDevice, bindToDevice, gotErr, wantErr)
 				}
 			}
 			bindToDevice := tcpip.BindToDeviceOption(88888)
 			if err := ep.GetSockOpt(&bindToDevice); err != nil {
-				t.Errorf("GetSockOpt got %s, want %v", err, nil)
-			}
-			if got, want := bindToDevice, testAction.getBindToDevice; got != want {
-				t.Errorf("bindToDevice got %d, want %d", got, want)
+				t.Errorf("GetSockOpt(&%T): %s", bindToDevice, err)
+			} else if bindToDevice != testAction.getBindToDevice {
+				t.Errorf("got bindToDevice = %d, want %d", bindToDevice, testAction.getBindToDevice)
 			}
 		})
 	}
@@ -4338,11 +4470,11 @@ func TestBindToDeviceOption(t *testing.T) {
 
 func makeStack() (*stack.Stack, *tcpip.Error) {
 	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{
-			ipv4.NewProtocol(),
-			ipv6.NewProtocol(),
+		NetworkProtocols: []stack.NetworkProtocolFactory{
+			ipv4.NewProtocol,
+			ipv6.NewProtocol,
 		},
-		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol},
 	})
 
 	id := loopback.New()
@@ -4411,7 +4543,7 @@ func TestSelfConnect(t *testing.T) {
 	}
 
 	<-notifyCh
-	if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != nil {
+	if err := ep.LastError(); err != nil {
 		t.Fatalf("Connect failed: %s", err)
 	}
 
@@ -4625,8 +4757,8 @@ func TestPathMTUDiscovery(t *testing.T) {
 				checker.PayloadLen(size+header.TCPMinimumSize),
 				checker.TCP(
 					checker.DstPort(context.TestPort),
-					checker.SeqNum(seqNum),
-					checker.AckNum(790),
+					checker.TCPSeqNum(seqNum),
+					checker.TCPAckNum(790),
 					checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 				),
 			)
@@ -4717,8 +4849,8 @@ func TestStackSetCongestionControl(t *testing.T) {
 				t.Fatalf("s.TransportProtocolOption(%v, %v) = %s", tcp.ProtocolNumber, &oldCC, err)
 			}
 
-			if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tc.cc); err != tc.err {
-				t.Fatalf("s.SetTransportProtocolOption(%v, %v) = %v, want %v", tcp.ProtocolNumber, tc.cc, err, tc.err)
+			if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &tc.cc); err != tc.err {
+				t.Fatalf("s.SetTransportProtocolOption(%d, &%T(%s)) = %s, want = %s", tcp.ProtocolNumber, tc.cc, tc.cc, err, tc.err)
 			}
 
 			var cc tcpip.CongestionControlOption
@@ -4750,12 +4882,12 @@ func TestStackAvailableCongestionControl(t *testing.T) {
 	s := c.Stack()
 
 	// Query permitted congestion control algorithms.
-	var aCC tcpip.AvailableCongestionControlOption
+	var aCC tcpip.TCPAvailableCongestionControlOption
 	if err := s.TransportProtocolOption(tcp.ProtocolNumber, &aCC); err != nil {
 		t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &aCC, err)
 	}
-	if got, want := aCC, tcpip.AvailableCongestionControlOption("reno cubic"); got != want {
-		t.Fatalf("got tcpip.AvailableCongestionControlOption: %v, want: %v", got, want)
+	if got, want := aCC, tcpip.TCPAvailableCongestionControlOption("reno cubic"); got != want {
+		t.Fatalf("got tcpip.TCPAvailableCongestionControlOption: %v, want: %v", got, want)
 	}
 }
 
@@ -4766,18 +4898,18 @@ func TestStackSetAvailableCongestionControl(t *testing.T) {
 	s := c.Stack()
 
 	// Setting AvailableCongestionControlOption should fail.
-	aCC := tcpip.AvailableCongestionControlOption("xyz")
+	aCC := tcpip.TCPAvailableCongestionControlOption("xyz")
 	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &aCC); err == nil {
-		t.Fatalf("s.TransportProtocolOption(%v, %v) = nil, want non-nil", tcp.ProtocolNumber, &aCC)
+		t.Fatalf("s.SetTransportProtocolOption(%d, &%T(%s)) = nil, want non-nil", tcp.ProtocolNumber, aCC, aCC)
 	}
 
 	// Verify that we still get the expected list of congestion control options.
-	var cc tcpip.AvailableCongestionControlOption
+	var cc tcpip.TCPAvailableCongestionControlOption
 	if err := s.TransportProtocolOption(tcp.ProtocolNumber, &cc); err != nil {
-		t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &cc, err)
+		t.Fatalf("s.TransportProtocolOptio(%d, &%T(%s)): %s", tcp.ProtocolNumber, cc, cc, err)
 	}
-	if got, want := cc, tcpip.AvailableCongestionControlOption("reno cubic"); got != want {
-		t.Fatalf("got tcpip.AvailableCongestionControlOption: %v, want: %v", got, want)
+	if got, want := cc, tcpip.TCPAvailableCongestionControlOption("reno cubic"); got != want {
+		t.Fatalf("got tcpip.TCPAvailableCongestionControlOption = %s, want = %s", got, want)
 	}
 }
 
@@ -4806,20 +4938,20 @@ func TestEndpointSetCongestionControl(t *testing.T) {
 
 				var oldCC tcpip.CongestionControlOption
 				if err := c.EP.GetSockOpt(&oldCC); err != nil {
-					t.Fatalf("c.EP.SockOpt(%v) = %s", &oldCC, err)
+					t.Fatalf("c.EP.GetSockOpt(&%T) = %s", oldCC, err)
 				}
 
 				if connected {
 					c.Connect(789 /* iss */, 32768 /* rcvWnd */, nil)
 				}
 
-				if err := c.EP.SetSockOpt(tc.cc); err != tc.err {
-					t.Fatalf("c.EP.SetSockOpt(%v) = %s, want %s", tc.cc, err, tc.err)
+				if err := c.EP.SetSockOpt(&tc.cc); err != tc.err {
+					t.Fatalf("got c.EP.SetSockOpt(&%#v) = %s, want %s", tc.cc, err, tc.err)
 				}
 
 				var cc tcpip.CongestionControlOption
 				if err := c.EP.GetSockOpt(&cc); err != nil {
-					t.Fatalf("c.EP.SockOpt(%v) = %s", &cc, err)
+					t.Fatalf("c.EP.GetSockOpt(&%T): %s", cc, err)
 				}
 
 				got, want := cc, oldCC
@@ -4831,7 +4963,7 @@ func TestEndpointSetCongestionControl(t *testing.T) {
 					want = tc.cc
 				}
 				if got != want {
-					t.Fatalf("got congestion control: %v, want: %v", got, want)
+					t.Fatalf("got congestion control = %+v, want = %+v", got, want)
 				}
 			})
 		}
@@ -4841,8 +4973,8 @@ func TestEndpointSetCongestionControl(t *testing.T) {
 func enableCUBIC(t *testing.T, c *context.Context) {
 	t.Helper()
 	opt := tcpip.CongestionControlOption("cubic")
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, opt); err != nil {
-		t.Fatalf("c.s.SetTransportProtocolOption(tcp.ProtocolNumber, %s = %s", opt, err)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%s)) %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 }
 
@@ -4852,11 +4984,23 @@ func TestKeepalive(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
+	const keepAliveIdle = 100 * time.Millisecond
 	const keepAliveInterval = 3 * time.Second
-	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(100 * time.Millisecond))
-	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
+	keepAliveIdleOpt := tcpip.KeepaliveIdleOption(keepAliveIdle)
+	if err := c.EP.SetSockOpt(&keepAliveIdleOpt); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", keepAliveIdleOpt, keepAliveIdle, err)
+	}
+	keepAliveIntervalOpt := tcpip.KeepaliveIntervalOption(keepAliveInterval)
+	if err := c.EP.SetSockOpt(&keepAliveIntervalOpt); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", keepAliveIntervalOpt, keepAliveInterval, err)
+	}
 	c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5)
-	c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true)
+	if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5); err != nil {
+		t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5): %s", err)
+	}
+	if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil {
+		t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err)
+	}
 
 	// 5 unacked keepalives are sent. ACK each one, and check that the
 	// connection stays alive after 5.
@@ -4865,8 +5009,8 @@ func TestKeepalive(t *testing.T) {
 		checker.IPv4(t, b,
 			checker.TCP(
 				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(c.IRS)),
-				checker.AckNum(uint32(790)),
+				checker.TCPSeqNum(uint32(c.IRS)),
+				checker.TCPAckNum(uint32(790)),
 				checker.TCPFlags(header.TCPFlagAck),
 			),
 		)
@@ -4899,8 +5043,8 @@ func TestKeepalive(t *testing.T) {
 		checker.PayloadLen(len(view)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(790),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -4911,8 +5055,8 @@ func TestKeepalive(t *testing.T) {
 		checker.PayloadLen(len(view)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(790),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(790),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagPsh),
 		),
 	)
@@ -4937,8 +5081,8 @@ func TestKeepalive(t *testing.T) {
 		checker.IPv4(t, b,
 			checker.TCP(
 				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(next-1)),
-				checker.AckNum(uint32(790)),
+				checker.TCPSeqNum(uint32(next-1)),
+				checker.TCPAckNum(uint32(790)),
 				checker.TCPFlags(header.TCPFlagAck),
 			),
 		)
@@ -4964,8 +5108,8 @@ func TestKeepalive(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(next)),
-			checker.AckNum(uint32(0)),
+			checker.TCPSeqNum(uint32(next)),
+			checker.TCPAckNum(uint32(0)),
 			checker.TCPFlags(header.TCPFlagRst),
 		),
 	)
@@ -4987,6 +5131,7 @@ func TestKeepalive(t *testing.T) {
 }
 
 func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCookieInUse bool) (irs, iss seqnum.Value) {
+	t.Helper()
 	// Send a SYN request.
 	irs = seqnum.Value(789)
 	c.SendPacket(nil, &context.Headers{
@@ -5005,7 +5150,7 @@ func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCooki
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(srcPort),
 		checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
-		checker.AckNum(uint32(irs) + 1),
+		checker.TCPAckNum(uint32(irs) + 1),
 	}
 
 	if synCookieInUse {
@@ -5031,6 +5176,7 @@ func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCooki
 }
 
 func executeV6Handshake(t *testing.T, c *context.Context, srcPort uint16, synCookieInUse bool) (irs, iss seqnum.Value) {
+	t.Helper()
 	// Send a SYN request.
 	irs = seqnum.Value(789)
 	c.SendV6Packet(nil, &context.Headers{
@@ -5049,7 +5195,7 @@ func executeV6Handshake(t *testing.T, c *context.Context, srcPort uint16, synCoo
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(srcPort),
 		checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
-		checker.AckNum(uint32(irs) + 1),
+		checker.TCPAckNum(uint32(irs) + 1),
 	}
 
 	if synCookieInUse {
@@ -5094,13 +5240,14 @@ func TestListenBacklogFull(t *testing.T) {
 
 	// Test acceptance.
 	// Start listening.
-	listenBacklog := 2
+	listenBacklog := 10
 	if err := c.EP.Listen(listenBacklog); err != nil {
 		t.Fatalf("Listen failed: %s", err)
 	}
 
-	for i := 0; i < listenBacklog; i++ {
-		executeHandshake(t, c, context.TestPort+uint16(i), false /*synCookieInUse */)
+	lastPortOffset := uint16(0)
+	for ; int(lastPortOffset) < listenBacklog; lastPortOffset++ {
+		executeHandshake(t, c, context.TestPort+lastPortOffset, false /*synCookieInUse */)
 	}
 
 	time.Sleep(50 * time.Millisecond)
@@ -5108,7 +5255,7 @@ func TestListenBacklogFull(t *testing.T) {
 	// Now execute send one more SYN. The stack should not respond as the backlog
 	// is full at this point.
 	c.SendPacket(nil, &context.Headers{
-		SrcPort: context.TestPort + 2,
+		SrcPort: context.TestPort + uint16(lastPortOffset),
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  seqnum.Value(789),
@@ -5122,12 +5269,12 @@ func TestListenBacklogFull(t *testing.T) {
 	defer c.WQ.EventUnregister(&we)
 
 	for i := 0; i < listenBacklog; i++ {
-		_, _, err = c.EP.Accept()
+		_, _, err = c.EP.Accept(nil)
 		if err == tcpip.ErrWouldBlock {
 			// Wait for connection to be established.
 			select {
 			case <-ch:
-				_, _, err = c.EP.Accept()
+				_, _, err = c.EP.Accept(nil)
 				if err != nil {
 					t.Fatalf("Accept failed: %s", err)
 				}
@@ -5139,7 +5286,7 @@ func TestListenBacklogFull(t *testing.T) {
 	}
 
 	// Now verify that there are no more connections that can be accepted.
-	_, _, err = c.EP.Accept()
+	_, _, err = c.EP.Accept(nil)
 	if err != tcpip.ErrWouldBlock {
 		select {
 		case <-ch:
@@ -5149,14 +5296,14 @@ func TestListenBacklogFull(t *testing.T) {
 	}
 
 	// Now a new handshake must succeed.
-	executeHandshake(t, c, context.TestPort+2, false /*synCookieInUse */)
+	executeHandshake(t, c, context.TestPort+lastPortOffset, false /*synCookieInUse */)
 
-	newEP, _, err := c.EP.Accept()
+	newEP, _, err := c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			newEP, _, err = c.EP.Accept()
+			newEP, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5181,6 +5328,8 @@ func TestListenBacklogFull(t *testing.T) {
 func TestListenNoAcceptNonUnicastV4(t *testing.T) {
 	multicastAddr := tcpip.Address("\xe0\x00\x01\x02")
 	otherMulticastAddr := tcpip.Address("\xe0\x00\x01\x03")
+	subnet := context.StackAddrWithPrefix.Subnet()
+	subnetBroadcastAddr := subnet.Broadcast()
 
 	tests := []struct {
 		name    string
@@ -5188,53 +5337,59 @@ func TestListenNoAcceptNonUnicastV4(t *testing.T) {
 		dstAddr tcpip.Address
 	}{
 		{
-			"SourceUnspecified",
-			header.IPv4Any,
-			context.StackAddr,
+			name:    "SourceUnspecified",
+			srcAddr: header.IPv4Any,
+			dstAddr: context.StackAddr,
 		},
 		{
-			"SourceBroadcast",
-			header.IPv4Broadcast,
-			context.StackAddr,
+			name:    "SourceBroadcast",
+			srcAddr: header.IPv4Broadcast,
+			dstAddr: context.StackAddr,
 		},
 		{
-			"SourceOurMulticast",
-			multicastAddr,
-			context.StackAddr,
+			name:    "SourceOurMulticast",
+			srcAddr: multicastAddr,
+			dstAddr: context.StackAddr,
 		},
 		{
-			"SourceOtherMulticast",
-			otherMulticastAddr,
-			context.StackAddr,
+			name:    "SourceOtherMulticast",
+			srcAddr: otherMulticastAddr,
+			dstAddr: context.StackAddr,
 		},
 		{
-			"DestUnspecified",
-			context.TestAddr,
-			header.IPv4Any,
+			name:    "DestUnspecified",
+			srcAddr: context.TestAddr,
+			dstAddr: header.IPv4Any,
 		},
 		{
-			"DestBroadcast",
-			context.TestAddr,
-			header.IPv4Broadcast,
+			name:    "DestBroadcast",
+			srcAddr: context.TestAddr,
+			dstAddr: header.IPv4Broadcast,
 		},
 		{
-			"DestOurMulticast",
-			context.TestAddr,
-			multicastAddr,
+			name:    "DestOurMulticast",
+			srcAddr: context.TestAddr,
+			dstAddr: multicastAddr,
 		},
 		{
-			"DestOtherMulticast",
-			context.TestAddr,
-			otherMulticastAddr,
+			name:    "DestOtherMulticast",
+			srcAddr: context.TestAddr,
+			dstAddr: otherMulticastAddr,
+		},
+		{
+			name:    "SrcSubnetBroadcast",
+			srcAddr: subnetBroadcastAddr,
+			dstAddr: context.StackAddr,
+		},
+		{
+			name:    "DestSubnetBroadcast",
+			srcAddr: context.TestAddr,
+			dstAddr: subnetBroadcastAddr,
 		},
 	}
 
 	for _, test := range tests {
-		test := test // capture range variable
-
 		t.Run(test.name, func(t *testing.T) {
-			t.Parallel()
-
 			c := context.New(t, defaultMTU)
 			defer c.Cleanup()
 
@@ -5275,7 +5430,7 @@ func TestListenNoAcceptNonUnicastV4(t *testing.T) {
 					checker.SrcPort(context.StackPort),
 					checker.DstPort(context.TestPort),
 					checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
-					checker.AckNum(uint32(irs)+1)))
+					checker.TCPAckNum(uint32(irs)+1)))
 		})
 	}
 }
@@ -5283,8 +5438,8 @@ func TestListenNoAcceptNonUnicastV4(t *testing.T) {
 // TestListenNoAcceptMulticastBroadcastV6 makes sure that TCP segments with a
 // non unicast IPv6 address are not accepted.
 func TestListenNoAcceptNonUnicastV6(t *testing.T) {
-	multicastAddr := tcpip.Address("\xff\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x01")
-	otherMulticastAddr := tcpip.Address("\xff\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02")
+	multicastAddr := tcpip.Address("\xff\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x01")
+	otherMulticastAddr := tcpip.Address("\xff\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02")
 
 	tests := []struct {
 		name    string
@@ -5334,11 +5489,7 @@ func TestListenNoAcceptNonUnicastV6(t *testing.T) {
 	}
 
 	for _, test := range tests {
-		test := test // capture range variable
-
 		t.Run(test.name, func(t *testing.T) {
-			t.Parallel()
-
 			c := context.New(t, defaultMTU)
 			defer c.Cleanup()
 
@@ -5379,7 +5530,7 @@ func TestListenNoAcceptNonUnicastV6(t *testing.T) {
 					checker.SrcPort(context.StackPort),
 					checker.DstPort(context.TestPort),
 					checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
-					checker.AckNum(uint32(irs)+1)))
+					checker.TCPAckNum(uint32(irs)+1)))
 		})
 	}
 }
@@ -5427,7 +5578,7 @@ func TestListenSynRcvdQueueFull(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
-		checker.AckNum(uint32(irs) + 1),
+		checker.TCPAckNum(uint32(irs) + 1),
 	}
 	checker.IPv4(t, b, checker.TCP(tcpCheckers...))
 
@@ -5463,12 +5614,12 @@ func TestListenSynRcvdQueueFull(t *testing.T) {
 	c.WQ.EventRegister(&we, waiter.EventIn)
 	defer c.WQ.EventUnregister(&we)
 
-	newEP, _, err := c.EP.Accept()
+	newEP, _, err := c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			newEP, _, err = c.EP.Accept()
+			newEP, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5492,8 +5643,9 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(1)); err != nil {
-		t.Fatalf("setting TCPSynRcvdCountThresholdOption to 1 failed: %s", err)
+	opt := tcpip.TCPSynRcvdCountThresholdOption(1)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	// Create TCP endpoint.
@@ -5539,12 +5691,12 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
 	c.WQ.EventRegister(&we, waiter.EventIn)
 	defer c.WQ.EventUnregister(&we)
 
-	_, _, err = c.EP.Accept()
+	_, _, err = c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			_, _, err = c.EP.Accept()
+			_, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5555,7 +5707,7 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
 	}
 
 	// Now verify that there are no more connections that can be accepted.
-	_, _, err = c.EP.Accept()
+	_, _, err = c.EP.Accept(nil)
 	if err != tcpip.ErrWouldBlock {
 		select {
 		case <-ch:
@@ -5604,7 +5756,7 @@ func TestSynRcvdBadSeqNumber(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
-		checker.AckNum(uint32(irs) + 1),
+		checker.TCPAckNum(uint32(irs) + 1),
 	}
 	checker.IPv4(t, b, checker.TCP(tcpCheckers...))
 
@@ -5625,8 +5777,8 @@ func TestSynRcvdBadSeqNumber(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagAck),
-		checker.AckNum(uint32(irs) + 1),
-		checker.SeqNum(uint32(iss + 1)),
+		checker.TCPAckNum(uint32(irs) + 1),
+		checker.TCPSeqNum(uint32(iss + 1)),
 	}
 	checker.IPv4(t, b, checker.TCP(tcpCheckers...))
 
@@ -5644,7 +5796,7 @@ func TestSynRcvdBadSeqNumber(t *testing.T) {
 		RcvWnd:  30000,
 	})
 
-	newEP, _, err := c.EP.Accept()
+	newEP, _, err := c.EP.Accept(nil)
 
 	if err != nil && err != tcpip.ErrWouldBlock {
 		t.Fatalf("Accept failed: %s", err)
@@ -5659,7 +5811,7 @@ func TestSynRcvdBadSeqNumber(t *testing.T) {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			newEP, _, err = c.EP.Accept()
+			newEP, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5717,12 +5869,12 @@ func TestPassiveConnectionAttemptIncrement(t *testing.T) {
 	defer c.WQ.EventUnregister(&we)
 
 	// Verify that there is only one acceptable connection at this point.
-	_, _, err = c.EP.Accept()
+	_, _, err = c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			_, _, err = c.EP.Accept()
+			_, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5787,12 +5939,12 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) {
 	defer c.WQ.EventUnregister(&we)
 
 	// Now check that there is one acceptable connections.
-	_, _, err = c.EP.Accept()
+	_, _, err = c.EP.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			_, _, err = c.EP.Accept()
+			_, _, err = c.EP.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5840,12 +5992,12 @@ func TestEndpointBindListenAcceptState(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	aep, _, err := ep.Accept()
+	aep, _, err := ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			aep, _, err = ep.Accept()
+			aep, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -5893,13 +6045,19 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 	// the segment queue holding unprocessed packets is limited to 500.
 	const receiveBufferSize = 80 << 10 // 80KB.
 	const maxReceiveBufferSize = receiveBufferSize * 10
-	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}
+		if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err)
+		}
 	}
 
 	// Enable auto-tuning.
-	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPModerateReceiveBufferOption(true)
+		if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 	// Change the expected window scale to match the value needed for the
 	// maximum buffer size defined above.
@@ -5918,16 +6076,14 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 	time.Sleep(latency)
 	rawEP.SendPacketWithTS([]byte{1}, tsVal)
 
-	// Verify that the ACK has the expected window.
-	wantRcvWnd := receiveBufferSize
-	wantRcvWnd = (wantRcvWnd >> uint32(c.WindowScale))
-	rawEP.VerifyACKRcvWnd(uint16(wantRcvWnd - 1))
+	pkt := rawEP.VerifyAndReturnACKWithTS(tsVal)
+	rcvWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize()
 	time.Sleep(25 * time.Millisecond)
 
 	// Allocate a large enough payload for the test.
-	b := make([]byte, int(receiveBufferSize)*2)
-	offset := 0
-	payloadSize := receiveBufferSize - 1
+	payloadSize := receiveBufferSize * 2
+	b := make([]byte, int(payloadSize))
+
 	worker := (c.EP).(interface {
 		StopWork()
 		ResumeWork()
@@ -5936,11 +6092,15 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 
 	// Stop the worker goroutine.
 	worker.StopWork()
-	start := offset
-	end := offset + payloadSize
+	start := 0
+	end := payloadSize / 2
 	packetsSent := 0
 	for ; start < end; start += mss {
-		rawEP.SendPacketWithTS(b[start:start+mss], tsVal)
+		packetEnd := start + mss
+		if start+mss > end {
+			packetEnd = end
+		}
+		rawEP.SendPacketWithTS(b[start:packetEnd], tsVal)
 		packetsSent++
 	}
 
@@ -5948,29 +6108,20 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 	// are waiting to be read.
 	worker.ResumeWork()
 
-	// Since we read no bytes the window should goto zero till the
-	// application reads some of the data.
-	// Discard all intermediate acks except the last one.
-	if packetsSent > 100 {
-		for i := 0; i < (packetsSent / 100); i++ {
-			_ = c.GetPacket()
-		}
+	// Since we sent almost the full receive buffer worth of data (some may have
+	// been dropped due to segment overheads), we should get a zero window back.
+	pkt = c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(pkt).Payload())
+	gotRcvWnd := tcpHdr.WindowSize()
+	wantAckNum := tcpHdr.AckNumber()
+	if got, want := int(gotRcvWnd), 0; got != want {
+		t.Fatalf("got rcvWnd: %d, want: %d", got, want)
 	}
-	rawEP.VerifyACKRcvWnd(0)
 
 	time.Sleep(25 * time.Millisecond)
-	// Verify that sending more data when window is closed is dropped and
-	// not acked.
+	// Verify that sending more data when receiveBuffer is exhausted.
 	rawEP.SendPacketWithTS(b[start:start+mss], tsVal)
 
-	// Verify that the stack sends us back an ACK with the sequence number
-	// of the last packet sent indicating it was dropped.
-	p := c.GetPacket()
-	checker.IPv4(t, p, checker.TCP(
-		checker.AckNum(uint32(rawEP.NextSeqNum)-uint32(mss)),
-		checker.Window(0),
-	))
-
 	// Now read all the data from the endpoint and verify that advertised
 	// window increases to the full available buffer size.
 	for {
@@ -5983,23 +6134,26 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 	// Verify that we receive a non-zero window update ACK. When running
 	// under thread santizer this test can end up sending more than 1
 	// ack, 1 for the non-zero window
-	p = c.GetPacket()
+	p := c.GetPacket()
 	checker.IPv4(t, p, checker.TCP(
-		checker.AckNum(uint32(rawEP.NextSeqNum)-uint32(mss)),
+		checker.TCPAckNum(uint32(wantAckNum)),
 		func(t *testing.T, h header.Transport) {
 			tcp, ok := h.(header.TCP)
 			if !ok {
 				return
 			}
-			if w := tcp.WindowSize(); w == 0 || w > uint16(wantRcvWnd) {
-				t.Errorf("expected a non-zero window: got %d, want <= wantRcvWnd", w)
+			// We use 10% here as the error margin upwards as the initial window we
+			// got was afer 1 segment was already in the receive buffer queue.
+			tolerance := 1.1
+			if w := tcp.WindowSize(); w == 0 || w > uint16(float64(rcvWnd)*tolerance) {
+				t.Errorf("expected a non-zero window: got %d, want <= %d", w, uint16(float64(rcvWnd)*tolerance))
 			}
 		},
 	))
 }
 
-// This test verifies that the auto tuning does not grow the receive buffer if
-// the application is not reading the data actively.
+// This test verifies that the advertised window is auto-tuned up as the
+// application is reading the data that is being received.
 func TestReceiveBufferAutoTuning(t *testing.T) {
 	const mtu = 1500
 	const mss = mtu - header.IPv4MinimumSize - header.TCPMinimumSize
@@ -6009,26 +6163,33 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 
 	// Enable Auto-tuning.
 	stk := c.Stack()
-	// Set lower limits for auto-tuning tests. This is required because the
-	// test stops the worker which can cause packets to be dropped because
-	// the segment queue holding unprocessed packets is limited to 300.
 	const receiveBufferSize = 80 << 10 // 80KB.
 	const maxReceiveBufferSize = receiveBufferSize * 10
-	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}
+		if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err)
+		}
 	}
 
 	// Enable auto-tuning.
-	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPModerateReceiveBufferOption(true)
+		if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 	// Change the expected window scale to match the value needed for the
 	// maximum buffer size used by stack.
 	c.WindowScale = uint8(tcp.FindWndScale(maxReceiveBufferSize))
 
 	rawEP := c.CreateConnectedWithOptions(header.TCPSynOptions{TS: true, WS: 4})
-
-	wantRcvWnd := receiveBufferSize
+	tsVal := uint32(rawEP.TSVal)
+	rawEP.NextSeqNum--
+	rawEP.SendPacketWithTS(nil, tsVal)
+	rawEP.NextSeqNum++
+	pkt := rawEP.VerifyAndReturnACKWithTS(tsVal)
+	curRcvWnd := int(header.TCP(header.IPv4(pkt).Payload()).WindowSize()) << c.WindowScale
 	scaleRcvWnd := func(rcvWnd int) uint16 {
 		return uint16(rcvWnd >> uint16(c.WindowScale))
 	}
@@ -6045,14 +6206,8 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 		StopWork()
 		ResumeWork()
 	})
-	tsVal := rawEP.TSVal
-	// We are going to do our own computation of what the moderated receive
-	// buffer should be based on sent/copied data per RTT and verify that
-	// the advertised window by the stack matches our calculations.
-	prevCopied := 0
-	done := false
 	latency := 1 * time.Millisecond
-	for i := 0; !done; i++ {
+	for i := 0; i < 5; i++ {
 		tsVal++
 
 		// Stop the worker goroutine.
@@ -6074,15 +6229,20 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 		// Give 1ms for the worker to process the packets.
 		time.Sleep(1 * time.Millisecond)
 
-		// Verify that the advertised window on the ACK is reduced by
-		// the total bytes sent.
-		expectedWnd := wantRcvWnd - totalSent
-		if packetsSent > 100 {
-			for i := 0; i < (packetsSent / 100); i++ {
-				_ = c.GetPacket()
+		lastACK := c.GetPacket()
+		// Discard any intermediate ACKs and only check the last ACK we get in a
+		// short time period of few ms.
+		for {
+			time.Sleep(1 * time.Millisecond)
+			pkt := c.GetPacketNonBlocking()
+			if pkt == nil {
+				break
 			}
+			lastACK = pkt
+		}
+		if got, want := int(header.TCP(header.IPv4(lastACK).Payload()).WindowSize()), int(scaleRcvWnd(curRcvWnd)); got > want {
+			t.Fatalf("advertised window got: %d, want <= %d", got, want)
 		}
-		rawEP.VerifyACKRcvWnd(scaleRcvWnd(expectedWnd))
 
 		// Now read all the data from the endpoint and invoke the
 		// moderation API to allow for receive buffer auto-tuning
@@ -6112,30 +6272,28 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 			// In the first iteration the receiver based RTT is not
 			// yet known as a result the moderation code should not
 			// increase the advertised window.
-			rawEP.VerifyACKRcvWnd(scaleRcvWnd(wantRcvWnd))
-			prevCopied = totalCopied
+			rawEP.VerifyACKRcvWnd(scaleRcvWnd(curRcvWnd))
 		} else {
-			rttCopied := totalCopied
-			if i == 1 {
-				// The moderation code accumulates copied bytes till
-				// RTT is established. So add in the bytes sent in
-				// the first iteration to the total bytes for this
-				// RTT.
-				rttCopied += prevCopied
-				// Now reset it to the initial value used by the
-				// auto tuning logic.
-				prevCopied = tcp.InitialCwnd * mss * 2
+			// Read loop above could generate an ACK if the window had dropped to
+			// zero and then read had opened it up.
+			lastACK := c.GetPacket()
+			// Discard any intermediate ACKs and only check the last ACK we get in a
+			// short time period of few ms.
+			for {
+				time.Sleep(1 * time.Millisecond)
+				pkt := c.GetPacketNonBlocking()
+				if pkt == nil {
+					break
+				}
+				lastACK = pkt
 			}
-			newWnd := rttCopied<<1 + 16*mss
-			grow := (newWnd * (rttCopied - prevCopied)) / prevCopied
-			newWnd += (grow << 1)
-			if newWnd > maxReceiveBufferSize {
-				newWnd = maxReceiveBufferSize
-				done = true
+			curRcvWnd = int(header.TCP(header.IPv4(lastACK).Payload()).WindowSize()) << c.WindowScale
+			// If thew new current window is close maxReceiveBufferSize then terminate
+			// the loop. This can happen before all iterations are done due to timing
+			// differences when running the test.
+			if int(float64(curRcvWnd)*1.1) > maxReceiveBufferSize/2 {
+				break
 			}
-			rawEP.VerifyACKRcvWnd(scaleRcvWnd(newWnd))
-			wantRcvWnd = newWnd
-			prevCopied = rttCopied
 			// Increase the latency after first two iterations to
 			// establish a low RTT value in the receiver since it
 			// only tracks the lowest value. This ensures that when
@@ -6148,6 +6306,12 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 		offset += payloadSize
 		payloadSize *= 2
 	}
+	// Check that at the end of our iterations the receive window grew close to the maximum
+	// permissible size of maxReceiveBufferSize/2
+	if got, want := int(float64(curRcvWnd)*1.1), maxReceiveBufferSize/2; got < want {
+		t.Fatalf("unexpected rcvWnd got: %d, want > %d", got, want)
+	}
+
 }
 
 func TestDelayEnabled(t *testing.T) {
@@ -6156,7 +6320,7 @@ func TestDelayEnabled(t *testing.T) {
 	checkDelayOption(t, c, false, false) // Delay is disabled by default.
 
 	for _, v := range []struct {
-		delayEnabled    tcp.DelayEnabled
+		delayEnabled    tcpip.TCPDelayEnabled
 		wantDelayOption bool
 	}{
 		{delayEnabled: false, wantDelayOption: false},
@@ -6164,17 +6328,17 @@ func TestDelayEnabled(t *testing.T) {
 	} {
 		c := context.New(t, defaultMTU)
 		defer c.Cleanup()
-		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, v.delayEnabled); err != nil {
-			t.Fatalf("SetTransportProtocolOption(tcp, %t) failed: %s", v.delayEnabled, err)
+		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &v.delayEnabled); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, v.delayEnabled, v.delayEnabled, err)
 		}
 		checkDelayOption(t, c, v.delayEnabled, v.wantDelayOption)
 	}
 }
 
-func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.DelayEnabled, wantDelayOption bool) {
+func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcpip.TCPDelayEnabled, wantDelayOption bool) {
 	t.Helper()
 
-	var gotDelayEnabled tcp.DelayEnabled
+	var gotDelayEnabled tcpip.TCPDelayEnabled
 	if err := c.Stack().TransportProtocolOption(tcp.ProtocolNumber, &gotDelayEnabled); err != nil {
 		t.Fatalf("TransportProtocolOption(tcp, &gotDelayEnabled) failed: %s", err)
 	}
@@ -6206,24 +6370,27 @@ func TestTCPLingerTimeout(t *testing.T) {
 		tcpLingerTimeout time.Duration
 		want             time.Duration
 	}{
-		{"NegativeLingerTimeout", -123123, 0},
-		{"ZeroLingerTimeout", 0, 0},
+		{"NegativeLingerTimeout", -123123, -1},
+		// Zero is treated same as the stack's default TCP_LINGER2 timeout.
+		{"ZeroLingerTimeout", 0, tcp.DefaultTCPLingerTimeout},
 		{"InRangeLingerTimeout", 10 * time.Second, 10 * time.Second},
 		// Values > stack's TCPLingerTimeout are capped to the stack's
 		// value. Defaults to tcp.DefaultTCPLingerTimeout(60 seconds)
-		{"AboveMaxLingerTimeout", 125 * time.Second, 120 * time.Second},
+		{"AboveMaxLingerTimeout", tcp.MaxTCPLingerTimeout + 5*time.Second, tcp.MaxTCPLingerTimeout},
 	}
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
-			if err := c.EP.SetSockOpt(tcpip.TCPLingerTimeoutOption(tc.tcpLingerTimeout)); err != nil {
-				t.Fatalf("SetSockOpt(%s) = %s", tc.tcpLingerTimeout, err)
+			v := tcpip.TCPLingerTimeoutOption(tc.tcpLingerTimeout)
+			if err := c.EP.SetSockOpt(&v); err != nil {
+				t.Fatalf("SetSockOpt(&%T(%s)) = %s", v, tc.tcpLingerTimeout, err)
 			}
-			var v tcpip.TCPLingerTimeoutOption
+
+			v = 0
 			if err := c.EP.GetSockOpt(&v); err != nil {
-				t.Fatalf("GetSockOpt(tcpip.TCPLingerTimeoutOption) = %s", err)
+				t.Fatalf("GetSockOpt(&%T) = %s", v, err)
 			}
 			if got, want := time.Duration(v), tc.want; got != want {
-				t.Fatalf("unexpected linger timeout got: %s, want: %s", got, want)
+				t.Fatalf("got linger timeout = %s, want = %s", got, want)
 			}
 		})
 	}
@@ -6277,12 +6444,12 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -6296,8 +6463,8 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+1)),
-		checker.AckNum(uint32(iss)+1),
+		checker.TCPSeqNum(uint32(c.IRS+1)),
+		checker.TCPAckNum(uint32(iss)+1),
 		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
 
 	finHeaders := &context.Headers{
@@ -6314,8 +6481,8 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+2)),
-		checker.AckNum(uint32(iss)+2),
+		checker.TCPSeqNum(uint32(c.IRS+2)),
+		checker.TCPAckNum(uint32(iss)+2),
 		checker.TCPFlags(header.TCPFlagAck)))
 
 	// Now send a RST and this should be ignored and not
@@ -6343,8 +6510,8 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+2)),
-		checker.AckNum(uint32(iss)+2),
+		checker.TCPSeqNum(uint32(c.IRS+2)),
+		checker.TCPAckNum(uint32(iss)+2),
 		checker.TCPFlags(header.TCPFlagAck)))
 }
 
@@ -6396,12 +6563,12 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -6415,8 +6582,8 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+1)),
-		checker.AckNum(uint32(iss)+1),
+		checker.TCPSeqNum(uint32(c.IRS+1)),
+		checker.TCPAckNum(uint32(iss)+1),
 		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
 
 	finHeaders := &context.Headers{
@@ -6433,8 +6600,8 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+2)),
-		checker.AckNum(uint32(iss)+2),
+		checker.TCPSeqNum(uint32(c.IRS+2)),
+		checker.TCPAckNum(uint32(iss)+2),
 		checker.TCPFlags(header.TCPFlagAck)))
 
 	// Out of order ACK should generate an immediate ACK in
@@ -6450,8 +6617,8 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+2)),
-		checker.AckNum(uint32(iss)+2),
+		checker.TCPSeqNum(uint32(c.IRS+2)),
+		checker.TCPAckNum(uint32(iss)+2),
 		checker.TCPFlags(header.TCPFlagAck)))
 }
 
@@ -6503,12 +6670,12 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -6522,8 +6689,8 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+1)),
-		checker.AckNum(uint32(iss)+1),
+		checker.TCPSeqNum(uint32(c.IRS+1)),
+		checker.TCPAckNum(uint32(iss)+1),
 		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
 
 	finHeaders := &context.Headers{
@@ -6540,8 +6707,8 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+2)),
-		checker.AckNum(uint32(iss)+2),
+		checker.TCPSeqNum(uint32(c.IRS+2)),
+		checker.TCPAckNum(uint32(iss)+2),
 		checker.TCPFlags(header.TCPFlagAck)))
 
 	// Send a SYN request w/ sequence number lower than
@@ -6558,6 +6725,13 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
 
 	c.CheckNoPacketTimeout("unexpected packet received in response to SYN", 1*time.Second)
 
+	// drain any older notifications from the notification channel before attempting
+	// 2nd connection.
+	select {
+	case <-ch:
+	default:
+	}
+
 	// Send a SYN request w/ sequence number higher than
 	// the highest sequence number sent.
 	iss = seqnum.Value(792)
@@ -6586,12 +6760,12 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
 	c.SendPacket(nil, ackHeaders)
 
 	// Try to accept the connection.
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -6609,8 +6783,9 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 	// Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed
 	// after 5 seconds in TIME_WAIT state.
 	tcpTimeWaitTimeout := 5 * time.Second
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
-		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err)
+	opt := tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, opt, tcpTimeWaitTimeout, err)
 	}
 
 	want := c.Stack().Stats().TCP.EstablishedClosed.Value() + 1
@@ -6659,12 +6834,12 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -6678,8 +6853,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+1)),
-		checker.AckNum(uint32(iss)+1),
+		checker.TCPSeqNum(uint32(c.IRS+1)),
+		checker.TCPAckNum(uint32(iss)+1),
 		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
 
 	finHeaders := &context.Headers{
@@ -6696,8 +6871,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+2)),
-		checker.AckNum(uint32(iss)+2),
+		checker.TCPSeqNum(uint32(c.IRS+2)),
+		checker.TCPAckNum(uint32(iss)+2),
 		checker.TCPFlags(header.TCPFlagAck)))
 
 	time.Sleep(2 * time.Second)
@@ -6711,8 +6886,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+2)),
-		checker.AckNum(uint32(iss)+2),
+		checker.TCPSeqNum(uint32(c.IRS+2)),
+		checker.TCPAckNum(uint32(iss)+2),
 		checker.TCPFlags(header.TCPFlagAck)))
 
 	// Sleep for 4 seconds so at this point we are 1 second past the
@@ -6740,8 +6915,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(ackHeaders.AckNum)),
-		checker.AckNum(0),
+		checker.TCPSeqNum(uint32(ackHeaders.AckNum)),
+		checker.TCPAckNum(0),
 		checker.TCPFlags(header.TCPFlagRst)))
 
 	if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != want {
@@ -6759,8 +6934,9 @@ func TestTCPCloseWithData(t *testing.T) {
 	// Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed
 	// after 5 seconds in TIME_WAIT state.
 	tcpTimeWaitTimeout := 5 * time.Second
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
-		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err)
+	opt := tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, opt, tcpTimeWaitTimeout, err)
 	}
 
 	wq := &waiter.Queue{}
@@ -6808,12 +6984,12 @@ func TestTCPCloseWithData(t *testing.T) {
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				t.Fatalf("Accept failed: %s", err)
 			}
@@ -6839,8 +7015,8 @@ func TestTCPCloseWithData(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+1)),
-		checker.AckNum(uint32(iss)+2),
+		checker.TCPSeqNum(uint32(c.IRS+1)),
+		checker.TCPAckNum(uint32(iss)+2),
 		checker.TCPFlags(header.TCPFlagAck)))
 
 	// Now write a few bytes and then close the endpoint.
@@ -6858,8 +7034,8 @@ func TestTCPCloseWithData(t *testing.T) {
 		checker.PayloadLen(len(data)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(iss)+2), // Acknum is initial sequence number + 1
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(iss)+2), // Acknum is initial sequence number + 1
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -6873,8 +7049,8 @@ func TestTCPCloseWithData(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(c.IRS+1)+uint32(len(data))),
-		checker.AckNum(uint32(iss+2)),
+		checker.TCPSeqNum(uint32(c.IRS+1)+uint32(len(data))),
+		checker.TCPAckNum(uint32(iss+2)),
 		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
 
 	// First send a partial ACK.
@@ -6919,8 +7095,8 @@ func TestTCPCloseWithData(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
-		checker.SeqNum(uint32(ackHeaders.AckNum)),
-		checker.AckNum(0),
+		checker.TCPSeqNum(uint32(ackHeaders.AckNum)),
+		checker.TCPAckNum(0),
 		checker.TCPFlags(header.TCPFlagRst)))
 }
 
@@ -6940,7 +7116,10 @@ func TestTCPUserTimeout(t *testing.T) {
 	// expired.
 	initRTO := 1 * time.Second
 	userTimeout := initRTO / 2
-	c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
+	v := tcpip.TCPUserTimeoutOption(userTimeout)
+	if err := c.EP.SetSockOpt(&v); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s): %s", v, userTimeout, err)
+	}
 
 	// Send some data and wait before ACKing it.
 	view := buffer.NewView(3)
@@ -6953,8 +7132,8 @@ func TestTCPUserTimeout(t *testing.T) {
 		checker.PayloadLen(len(view)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(next),
-			checker.AckNum(790),
+			checker.TCPSeqNum(next),
+			checker.TCPAckNum(790),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -6988,8 +7167,8 @@ func TestTCPUserTimeout(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(next)),
-			checker.AckNum(uint32(0)),
+			checker.TCPSeqNum(uint32(next)),
+			checker.TCPAckNum(uint32(0)),
 			checker.TCPFlags(header.TCPFlagRst),
 		),
 	)
@@ -7014,18 +7193,31 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 
 	origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
 
+	const keepAliveIdle = 100 * time.Millisecond
 	const keepAliveInterval = 3 * time.Second
-	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(100 * time.Millisecond))
-	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
-	c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10)
-	c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true)
+	keepAliveIdleOption := tcpip.KeepaliveIdleOption(keepAliveIdle)
+	if err := c.EP.SetSockOpt(&keepAliveIdleOption); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", keepAliveIdleOption, keepAliveIdle, err)
+	}
+	keepAliveIntervalOption := tcpip.KeepaliveIntervalOption(keepAliveInterval)
+	if err := c.EP.SetSockOpt(&keepAliveIntervalOption); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", keepAliveIntervalOption, keepAliveInterval, err)
+	}
+	if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10); err != nil {
+		t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10): %s", err)
+	}
+	if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil {
+		t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err)
+	}
 
 	// Set userTimeout to be the duration to be 1 keepalive
 	// probes. Which means that after the first probe is sent
 	// the second one should cause the connection to be
 	// closed due to userTimeout being hit.
-	userTimeout := 1 * keepAliveInterval
-	c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
+	userTimeout := tcpip.TCPUserTimeoutOption(keepAliveInterval)
+	if err := c.EP.SetSockOpt(&userTimeout); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", userTimeout, keepAliveInterval, err)
+	}
 
 	// Check that the connection is still alive.
 	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
@@ -7037,8 +7229,8 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 	checker.IPv4(t, b,
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)),
-			checker.AckNum(uint32(790)),
+			checker.TCPSeqNum(uint32(c.IRS)),
+			checker.TCPAckNum(uint32(790)),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -7063,8 +7255,8 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS+1)),
-			checker.AckNum(uint32(0)),
+			checker.TCPSeqNum(uint32(c.IRS+1)),
+			checker.TCPAckNum(uint32(0)),
 			checker.TCPFlags(header.TCPFlagRst),
 		),
 	)
@@ -7080,9 +7272,9 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 	}
 }
 
-func TestIncreaseWindowOnReceive(t *testing.T) {
+func TestIncreaseWindowOnRead(t *testing.T) {
 	// This test ensures that the endpoint sends an ack,
-	// after recv() when the window grows to more than 1 MSS.
+	// after read() when the window grows by more than 1 MSS.
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
@@ -7091,10 +7283,9 @@ func TestIncreaseWindowOnReceive(t *testing.T) {
 
 	// Write chunks of ~30000 bytes. It's important that two
 	// payloads make it equal or longer than MSS.
-	remain := rcvBuf
+	remain := rcvBuf * 2
 	sent := 0
 	data := make([]byte, defaultMTU/2)
-	lastWnd := uint16(0)
 
 	for remain > len(data) {
 		c.SendPacket(data, &context.Headers{
@@ -7107,46 +7298,43 @@ func TestIncreaseWindowOnReceive(t *testing.T) {
 		})
 		sent += len(data)
 		remain -= len(data)
-
-		lastWnd = uint16(remain)
-		if remain > 0xffff {
-			lastWnd = 0xffff
-		}
-		checker.IPv4(t, c.GetPacket(),
+		pkt := c.GetPacket()
+		checker.IPv4(t, pkt,
 			checker.PayloadLen(header.TCPMinimumSize),
 			checker.TCP(
 				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(c.IRS)+1),
-				checker.AckNum(uint32(790+sent)),
-				checker.Window(lastWnd),
+				checker.TCPSeqNum(uint32(c.IRS)+1),
+				checker.TCPAckNum(uint32(790+sent)),
 				checker.TCPFlags(header.TCPFlagAck),
 			),
 		)
+		// Break once the window drops below defaultMTU/2
+		if wnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize(); wnd < defaultMTU/2 {
+			break
+		}
 	}
 
-	if lastWnd == 0xffff || lastWnd == 0 {
-		t.Fatalf("expected small, non-zero window: %d", lastWnd)
-	}
-
-	// We now have < 1 MSS in the buffer space. Read the data! An
-	// ack should be sent in response to that. The window was not
-	// zero, but it grew to larger than MSS.
-	if _, _, err := c.EP.Read(nil); err != nil {
-		t.Fatalf("Read failed: %s", err)
-	}
-
-	if _, _, err := c.EP.Read(nil); err != nil {
-		t.Fatalf("Read failed: %s", err)
+	// We now have < 1 MSS in the buffer space. Read at least > 2 MSS
+	// worth of data as receive buffer space
+	read := 0
+	// defaultMTU is a good enough estimate for the MSS used for this
+	// connection.
+	for read < defaultMTU*2 {
+		v, _, err := c.EP.Read(nil)
+		if err != nil {
+			t.Fatalf("Read failed: %s", err)
+		}
+		read += len(v)
 	}
 
-	// After reading two packets, we surely crossed MSS. See the ack:
+	// After reading > MSS worth of data, we surely crossed MSS. See the ack:
 	checker.IPv4(t, c.GetPacket(),
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(790+sent)),
-			checker.Window(uint16(0xffff)),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(790+sent)),
+			checker.TCPWindow(uint16(0xffff)),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -7166,7 +7354,6 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) {
 	remain := rcvBuf
 	sent := 0
 	data := make([]byte, defaultMTU/2)
-	lastWnd := uint16(0)
 
 	for remain > len(data) {
 		c.SendPacket(data, &context.Headers{
@@ -7179,39 +7366,29 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) {
 		})
 		sent += len(data)
 		remain -= len(data)
-
-		lastWnd = uint16(remain)
-		if remain > 0xffff {
-			lastWnd = 0xffff
-		}
 		checker.IPv4(t, c.GetPacket(),
 			checker.PayloadLen(header.TCPMinimumSize),
 			checker.TCP(
 				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(c.IRS)+1),
-				checker.AckNum(uint32(790+sent)),
-				checker.Window(lastWnd),
+				checker.TCPSeqNum(uint32(c.IRS)+1),
+				checker.TCPAckNum(uint32(790+sent)),
+				checker.TCPWindowLessThanEq(0xffff),
 				checker.TCPFlags(header.TCPFlagAck),
 			),
 		)
 	}
 
-	if lastWnd == 0xffff || lastWnd == 0 {
-		t.Fatalf("expected small, non-zero window: %d", lastWnd)
-	}
-
 	// Increasing the buffer from should generate an ACK,
 	// since window grew from small value to larger equal MSS
 	c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBuf*2)
 
-	// After reading two packets, we surely crossed MSS. See the ack:
 	checker.IPv4(t, c.GetPacket(),
 		checker.PayloadLen(header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(790+sent)),
-			checker.Window(uint16(0xffff)),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(790+sent)),
+			checker.TCPWindow(uint16(0xffff)),
 			checker.TCPFlags(header.TCPFlagAck),
 		),
 	)
@@ -7232,14 +7409,15 @@ func TestTCPDeferAccept(t *testing.T) {
 	}
 
 	const tcpDeferAccept = 1 * time.Second
-	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
-		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %s", tcpDeferAccept, err)
+	tcpDeferAcceptOption := tcpip.TCPDeferAcceptOption(tcpDeferAccept)
+	if err := c.EP.SetSockOpt(&tcpDeferAcceptOption); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)): %s", tcpDeferAcceptOption, tcpDeferAccept, err)
 	}
 
 	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
 
-	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
-		t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: %s", err, tcpip.ErrWouldBlock)
+	if _, _, err := c.EP.Accept(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Accept(nil) = %s, want: %s", err, tcpip.ErrWouldBlock)
 	}
 
 	// Send data. This should result in an acceptable endpoint.
@@ -7255,14 +7433,14 @@ func TestTCPDeferAccept(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagAck),
-		checker.SeqNum(uint32(iss+1)),
-		checker.AckNum(uint32(irs+5))))
+		checker.TCPSeqNum(uint32(iss+1)),
+		checker.TCPAckNum(uint32(irs+5))))
 
 	// Give a bit of time for the socket to be delivered to the accept queue.
 	time.Sleep(50 * time.Millisecond)
-	aep, _, err := c.EP.Accept()
+	aep, _, err := c.EP.Accept(nil)
 	if err != nil {
-		t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: nil", err)
+		t.Fatalf("got c.EP.Accept(nil) = %s, want: nil", err)
 	}
 
 	aep.Close()
@@ -7270,8 +7448,8 @@ func TestTCPDeferAccept(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
-		checker.SeqNum(uint32(iss+1)),
-		checker.AckNum(uint32(irs+5))))
+		checker.TCPSeqNum(uint32(iss+1)),
+		checker.TCPAckNum(uint32(irs+5))))
 }
 
 func TestTCPDeferAcceptTimeout(t *testing.T) {
@@ -7289,14 +7467,15 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
 	}
 
 	const tcpDeferAccept = 1 * time.Second
-	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
-		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %s", tcpDeferAccept, err)
+	tcpDeferAcceptOpt := tcpip.TCPDeferAcceptOption(tcpDeferAccept)
+	if err := c.EP.SetSockOpt(&tcpDeferAcceptOpt); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(&%T(%s)) failed: %s", tcpDeferAcceptOpt, tcpDeferAccept, err)
 	}
 
 	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
 
-	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
-		t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: %s", err, tcpip.ErrWouldBlock)
+	if _, _, err := c.EP.Accept(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Accept(nil) = %s, want: %s", err, tcpip.ErrWouldBlock)
 	}
 
 	// Sleep for a little of the tcpDeferAccept timeout.
@@ -7307,7 +7486,7 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
-		checker.AckNum(uint32(irs)+1)))
+		checker.TCPAckNum(uint32(irs)+1)))
 
 	// Send data. This should result in an acceptable endpoint.
 	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
@@ -7323,14 +7502,14 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagAck),
-		checker.SeqNum(uint32(iss+1)),
-		checker.AckNum(uint32(irs+5))))
+		checker.TCPSeqNum(uint32(iss+1)),
+		checker.TCPAckNum(uint32(irs+5))))
 
 	// Give sometime for the endpoint to be delivered to the accept queue.
 	time.Sleep(50 * time.Millisecond)
-	aep, _, err := c.EP.Accept()
+	aep, _, err := c.EP.Accept(nil)
 	if err != nil {
-		t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: nil", err)
+		t.Fatalf("got c.EP.Accept(nil) = %s, want: nil", err)
 	}
 
 	aep.Close()
@@ -7339,8 +7518,8 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
-		checker.SeqNum(uint32(iss+1)),
-		checker.AckNum(uint32(irs+5))))
+		checker.TCPSeqNum(uint32(iss+1)),
+		checker.TCPAckNum(uint32(irs+5))))
 }
 
 func TestResetDuringClose(t *testing.T) {
@@ -7365,8 +7544,8 @@ func TestResetDuringClose(t *testing.T) {
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.DstPort(context.TestPort),
 		checker.TCPFlags(header.TCPFlagAck),
-		checker.SeqNum(uint32(irs.Add(1))),
-		checker.AckNum(uint32(iss.Add(5)))))
+		checker.TCPSeqNum(uint32(irs.Add(1))),
+		checker.TCPAckNum(uint32(iss.Add(5)))))
 
 	// Close in a separate goroutine so that we can trigger
 	// a race with the RST we send below. This should not
@@ -7428,9 +7607,10 @@ func TestSetStackTimeWaitReuse(t *testing.T) {
 	}
 
 	for _, tc := range testCases {
-		err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitReuseOption(tc.v))
+		opt := tcpip.TCPTimeWaitReuseOption(tc.v)
+		err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)
 		if got, want := err, tc.err; got != want {
-			t.Fatalf("s.TransportProtocolOption(%v, %v) = %v, want %v", tcp.ProtocolNumber, tc.v, err, tc.err)
+			t.Fatalf("s.SetTransportProtocolOption(%d, &%T(%d)) = %s, want = %s", tcp.ProtocolNumber, tc.v, tc.v, err, tc.err)
 		}
 		if tc.err != nil {
 			continue
@@ -7446,3 +7626,14 @@ func TestSetStackTimeWaitReuse(t *testing.T) {
 		}
 	}
 }
+
+// generateRandomPayload generates a random byte slice of the specified length
+// causing a fatal test failure if it is unable to do so.
+func generateRandomPayload(t *testing.T, n int) []byte {
+	t.Helper()
+	buf := make([]byte, n)
+	if _, err := rand.Read(buf); err != nil {
+		t.Fatalf("rand.Read(buf) failed: %s", err)
+	}
+	return buf
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
index 8edbff964..0f9ed06cd 100644
--- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
@@ -131,8 +131,9 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS
 	defer c.Cleanup()
 
 	if cookieEnabled {
-		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
-			t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+		var opt tcpip.TCPSynRcvdCountThresholdOption
+		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 		}
 	}
 
@@ -158,9 +159,9 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS
 		checker.PayloadLen(len(data)+header.TCPMinimumSize+12),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
-			checker.Window(wndSize),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
+			checker.TCPWindow(wndSize),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 			checker.TCPTimestampChecker(true, 0, tsVal+1),
 		),
@@ -180,7 +181,8 @@ func TestTimeStampEnabledAccept(t *testing.T) {
 		wndSize       uint16
 	}{
 		{true, -1, 0xffff}, // When cookie is used window scaling is disabled.
-		{false, 5, 0x8000}, // DefaultReceiveBufferSize is 1MB >> 5.
+		// DefaultReceiveBufferSize is 1MB >> 5. Advertised window will be 1/2 of that.
+		{false, 5, 0x4000},
 	}
 	for _, tc := range testCases {
 		timeStampEnabledAccept(t, tc.cookieEnabled, tc.wndScale, tc.wndSize)
@@ -192,8 +194,9 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd
 	defer c.Cleanup()
 
 	if cookieEnabled {
-		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
-			t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+		var opt tcpip.TCPSynRcvdCountThresholdOption
+		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
 		}
 	}
 
@@ -217,9 +220,9 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd
 		checker.PayloadLen(len(data)+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(790),
-			checker.Window(wndSize),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(790),
+			checker.TCPWindow(wndSize),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 			checker.TCPTimestampChecker(false, 0, 0),
 		),
@@ -235,7 +238,9 @@ func TestTimeStampDisabledAccept(t *testing.T) {
 		wndSize       uint16
 	}{
 		{true, -1, 0xffff}, // When cookie is used window scaling is disabled.
-		{false, 5, 0x8000}, // DefaultReceiveBufferSize is 1MB >> 5.
+		// DefaultReceiveBufferSize is 1MB >> 5. Advertised window will be half of
+		// that.
+		{false, 5, 0x4000},
 	}
 	for _, tc := range testCases {
 		timeStampDisabledAccept(t, tc.cookieEnabled, tc.wndScale, tc.wndSize)
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index b6031354e..4d7847142 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -53,11 +53,11 @@ const (
 	TestPort = 4096
 
 	// StackV6Addr is the IPv6 address assigned to the stack.
-	StackV6Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+	StackV6Addr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
 
 	// TestV6Addr is the source address for packets sent to the stack via
 	// the link layer endpoint.
-	TestV6Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+	TestV6Addr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
 
 	// StackV4MappedAddr is StackAddr as a mapped v6 address.
 	StackV4MappedAddr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff" + StackAddr
@@ -68,11 +68,23 @@ const (
 	// V4MappedWildcardAddr is the mapped v6 representation of 0.0.0.0.
 	V4MappedWildcardAddr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00"
 
-	// testInitialSequenceNumber is the initial sequence number sent in packets that
+	// TestInitialSequenceNumber is the initial sequence number sent in packets that
 	// are sent in response to a SYN or in the initial SYN sent to the stack.
-	testInitialSequenceNumber = 789
+	TestInitialSequenceNumber = 789
 )
 
+// StackAddrWithPrefix is StackAddr with its associated prefix length.
+var StackAddrWithPrefix = tcpip.AddressWithPrefix{
+	Address:   StackAddr,
+	PrefixLen: 24,
+}
+
+// StackV6AddrWithPrefix is StackV6Addr with its associated prefix length.
+var StackV6AddrWithPrefix = tcpip.AddressWithPrefix{
+	Address:   StackV6Addr,
+	PrefixLen: header.IIDOffsetInIPv6Address * 8,
+}
+
 // Headers is used to represent the TCP header fields when building a
 // new packet.
 type Headers struct {
@@ -133,32 +145,39 @@ type Context struct {
 	// WindowScale is the expected window scale in SYN packets sent by
 	// the stack.
 	WindowScale uint8
+
+	// RcvdWindowScale is the actual window scale sent by the stack in
+	// SYN/SYN-ACK.
+	RcvdWindowScale uint8
 }
 
 // New allocates and initializes a test context containing a new
 // stack and a link-layer endpoint.
 func New(t *testing.T, mtu uint32) *Context {
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol},
 	})
 
 	const sendBufferSize = 1 << 20 // 1 MiB
 	const recvBufferSize = 1 << 20 // 1 MiB
 	// Allow minimum send/receive buffer sizes to be 1 during tests.
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{Min: 1, Default: sendBufferSize, Max: 10 * sendBufferSize}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	sendBufOpt := tcpip.TCPSendBufferSizeRangeOption{Min: 1, Default: sendBufferSize, Max: 10 * sendBufferSize}
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &sendBufOpt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%#v) failed: %s", tcp.ProtocolNumber, sendBufOpt, err)
 	}
 
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: recvBufferSize, Max: 10 * recvBufferSize}); err != nil {
-		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	rcvBufOpt := tcpip.TCPReceiveBufferSizeRangeOption{Min: 1, Default: recvBufferSize, Max: 10 * recvBufferSize}
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &rcvBufOpt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%#v) failed: %s", tcp.ProtocolNumber, rcvBufOpt, err)
 	}
 
 	// Increase minimum RTO in tests to avoid test flakes due to early
 	// retransmit in case the test executors are overloaded and cause timers
 	// to fire earlier than expected.
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMinRTOOption(3*time.Second)); err != nil {
-		t.Fatalf("failed to set stack-wide minRTO: %s", err)
+	minRTOOpt := tcpip.TCPMinRTOOption(3 * time.Second)
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &minRTOOpt); err != nil {
+		t.Fatalf("s.SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, minRTOOpt, minRTOOpt, err)
 	}
 
 	// Some of the congestion control tests send up to 640 packets, we so
@@ -181,12 +200,20 @@ func New(t *testing.T, mtu uint32) *Context {
 		t.Fatalf("CreateNICWithOptions(_, _, %+v) failed: %v", opts2, err)
 	}
 
-	if err := s.AddAddress(1, ipv4.ProtocolNumber, StackAddr); err != nil {
-		t.Fatalf("AddAddress failed: %v", err)
+	v4ProtocolAddr := tcpip.ProtocolAddress{
+		Protocol:          ipv4.ProtocolNumber,
+		AddressWithPrefix: StackAddrWithPrefix,
+	}
+	if err := s.AddProtocolAddress(1, v4ProtocolAddr); err != nil {
+		t.Fatalf("AddProtocolAddress(1, %#v): %s", v4ProtocolAddr, err)
 	}
 
-	if err := s.AddAddress(1, ipv6.ProtocolNumber, StackV6Addr); err != nil {
-		t.Fatalf("AddAddress failed: %v", err)
+	v6ProtocolAddr := tcpip.ProtocolAddress{
+		Protocol:          ipv6.ProtocolNumber,
+		AddressWithPrefix: StackV6AddrWithPrefix,
+	}
+	if err := s.AddProtocolAddress(1, v6ProtocolAddr); err != nil {
+		t.Fatalf("AddProtocolAddress(1, %#v): %s", v6ProtocolAddr, err)
 	}
 
 	s.SetRouteTable([]tcpip.Route{
@@ -238,18 +265,17 @@ func (c *Context) CheckNoPacket(errMsg string) {
 	c.CheckNoPacketTimeout(errMsg, 1*time.Second)
 }
 
-// GetPacket reads a packet from the link layer endpoint and verifies
+// GetPacketWithTimeout reads a packet from the link layer endpoint and verifies
 // that it is an IPv4 packet with the expected source and destination
-// addresses. It will fail with an error if no packet is received for
-// 2 seconds.
-func (c *Context) GetPacket() []byte {
+// addresses. If no packet is received in the specified timeout it will return
+// nil.
+func (c *Context) GetPacketWithTimeout(timeout time.Duration) []byte {
 	c.t.Helper()
 
-	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	defer cancel()
 	p, ok := c.linkEP.ReadContext(ctx)
 	if !ok {
-		c.t.Fatalf("Packet wasn't written out")
 		return nil
 	}
 
@@ -257,6 +283,14 @@ func (c *Context) GetPacket() []byte {
 		c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber)
 	}
 
+	// Just check that the stack set the transport protocol number for outbound
+	// TCP messages.
+	// TODO(gvisor.dev/issues/3810): Remove when protocol numbers are part
+	// of the headerinfo.
+	if p.Pkt.TransportProtocolNumber != tcp.ProtocolNumber {
+		c.t.Fatalf("got p.Pkt.TransportProtocolNumber = %d, want = %d", p.Pkt.TransportProtocolNumber, tcp.ProtocolNumber)
+	}
+
 	vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views())
 	b := vv.ToView()
 
@@ -268,6 +302,21 @@ func (c *Context) GetPacket() []byte {
 	return b
 }
 
+// GetPacket reads a packet from the link layer endpoint and verifies
+// that it is an IPv4 packet with the expected source and destination
+// addresses.
+func (c *Context) GetPacket() []byte {
+	c.t.Helper()
+
+	p := c.GetPacketWithTimeout(5 * time.Second)
+	if p == nil {
+		c.t.Fatalf("Packet wasn't written out")
+		return nil
+	}
+
+	return p
+}
+
 // GetPacketNonBlocking reads a packet from the link layer endpoint
 // and verifies that it is an IPv4 packet with the expected source
 // and destination address. If no packet is available it will return
@@ -284,6 +333,14 @@ func (c *Context) GetPacketNonBlocking() []byte {
 		c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber)
 	}
 
+	// Just check that the stack set the transport protocol number for outbound
+	// TCP messages.
+	// TODO(gvisor.dev/issues/3810): Remove when protocol numbers are part
+	// of the headerinfo.
+	if p.Pkt.TransportProtocolNumber != tcp.ProtocolNumber {
+		c.t.Fatalf("got p.Pkt.TransportProtocolNumber = %d, want = %d", p.Pkt.TransportProtocolNumber, tcp.ProtocolNumber)
+	}
+
 	vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views())
 	b := vv.ToView()
 
@@ -447,8 +504,8 @@ func (c *Context) ReceiveAndCheckPacketWithOptions(data []byte, offset, size, op
 		checker.PayloadLen(size+header.TCPMinimumSize+optlen),
 		checker.TCP(
 			checker.DstPort(TestPort),
-			checker.SeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))),
-			checker.AckNum(uint32(seqnum.Value(testInitialSequenceNumber).Add(1))),
+			checker.TCPSeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))),
+			checker.TCPAckNum(uint32(seqnum.Value(TestInitialSequenceNumber).Add(1))),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -474,8 +531,8 @@ func (c *Context) ReceiveNonBlockingAndCheckPacket(data []byte, offset, size int
 		checker.PayloadLen(size+header.TCPMinimumSize),
 		checker.TCP(
 			checker.DstPort(TestPort),
-			checker.SeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))),
-			checker.AckNum(uint32(seqnum.Value(testInitialSequenceNumber).Add(1))),
+			checker.TCPSeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))),
+			checker.TCPAckNum(uint32(seqnum.Value(TestInitialSequenceNumber).Add(1))),
 			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 		),
 	)
@@ -613,6 +670,7 @@ func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte)
 	}
 
 	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	synOpts := header.ParseSynOptions(tcpHdr.Options(), false /* isAck */)
 	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
 
 	c.SendPacket(nil, &Headers{
@@ -630,15 +688,15 @@ func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte)
 		checker.TCP(
 			checker.DstPort(TestPort),
 			checker.TCPFlags(header.TCPFlagAck),
-			checker.SeqNum(uint32(c.IRS)+1),
-			checker.AckNum(uint32(iss)+1),
+			checker.TCPSeqNum(uint32(c.IRS)+1),
+			checker.TCPAckNum(uint32(iss)+1),
 		),
 	)
 
 	// Wait for connection to be established.
 	select {
 	case <-notifyCh:
-		if err := c.EP.GetSockOpt(tcpip.ErrorOption{}); err != nil {
+		if err := c.EP.LastError(); err != nil {
 			c.t.Fatalf("Unexpected error when connecting: %v", err)
 		}
 	case <-time.After(1 * time.Second):
@@ -648,6 +706,7 @@ func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte)
 		c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
 	}
 
+	c.RcvdWindowScale = uint8(synOpts.WS)
 	c.Port = tcpHdr.SourcePort()
 }
 
@@ -719,17 +778,18 @@ func (r *RawEndpoint) SendPacket(payload []byte, opts []byte) {
 	r.NextSeqNum = r.NextSeqNum.Add(seqnum.Size(len(payload)))
 }
 
-// VerifyACKWithTS verifies that the tsEcr field in the ack matches the provided
-// tsVal.
-func (r *RawEndpoint) VerifyACKWithTS(tsVal uint32) {
+// VerifyAndReturnACKWithTS verifies that the tsEcr field int he ACK matches
+// the provided tsVal as well as returns the original packet.
+func (r *RawEndpoint) VerifyAndReturnACKWithTS(tsVal uint32) []byte {
+	r.C.t.Helper()
 	// Read ACK and verify that tsEcr of ACK packet is [1,2,3,4]
 	ackPacket := r.C.GetPacket()
 	checker.IPv4(r.C.t, ackPacket,
 		checker.TCP(
 			checker.DstPort(r.SrcPort),
 			checker.TCPFlags(header.TCPFlagAck),
-			checker.SeqNum(uint32(r.AckNum)),
-			checker.AckNum(uint32(r.NextSeqNum)),
+			checker.TCPSeqNum(uint32(r.AckNum)),
+			checker.TCPAckNum(uint32(r.NextSeqNum)),
 			checker.TCPTimestampChecker(true, 0, tsVal),
 		),
 	)
@@ -737,19 +797,28 @@ func (r *RawEndpoint) VerifyACKWithTS(tsVal uint32) {
 	tcpSeg := header.TCP(header.IPv4(ackPacket).Payload())
 	opts := tcpSeg.ParsedOptions()
 	r.RecentTS = opts.TSVal
+	return ackPacket
+}
+
+// VerifyACKWithTS verifies that the tsEcr field in the ack matches the provided
+// tsVal.
+func (r *RawEndpoint) VerifyACKWithTS(tsVal uint32) {
+	r.C.t.Helper()
+	_ = r.VerifyAndReturnACKWithTS(tsVal)
 }
 
 // VerifyACKRcvWnd verifies that the window advertised by the incoming ACK
 // matches the provided rcvWnd.
 func (r *RawEndpoint) VerifyACKRcvWnd(rcvWnd uint16) {
+	r.C.t.Helper()
 	ackPacket := r.C.GetPacket()
 	checker.IPv4(r.C.t, ackPacket,
 		checker.TCP(
 			checker.DstPort(r.SrcPort),
 			checker.TCPFlags(header.TCPFlagAck),
-			checker.SeqNum(uint32(r.AckNum)),
-			checker.AckNum(uint32(r.NextSeqNum)),
-			checker.Window(rcvWnd),
+			checker.TCPSeqNum(uint32(r.AckNum)),
+			checker.TCPAckNum(uint32(r.NextSeqNum)),
+			checker.TCPWindow(rcvWnd),
 		),
 	)
 }
@@ -768,8 +837,8 @@ func (r *RawEndpoint) VerifyACKHasSACK(sackBlocks []header.SACKBlock) {
 		checker.TCP(
 			checker.DstPort(r.SrcPort),
 			checker.TCPFlags(header.TCPFlagAck),
-			checker.SeqNum(uint32(r.AckNum)),
-			checker.AckNum(uint32(r.NextSeqNum)),
+			checker.TCPSeqNum(uint32(r.AckNum)),
+			checker.TCPAckNum(uint32(r.NextSeqNum)),
 			checker.TCPSACKBlockChecker(sackBlocks),
 		),
 	)
@@ -843,7 +912,7 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
 
 	// Build SYN-ACK.
 	c.IRS = seqnum.Value(tcpSeg.SequenceNumber())
-	iss := seqnum.Value(testInitialSequenceNumber)
+	iss := seqnum.Value(TestInitialSequenceNumber)
 	c.SendPacket(nil, &Headers{
 		SrcPort: tcpSeg.DestinationPort(),
 		DstPort: tcpSeg.SourcePort(),
@@ -861,8 +930,8 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
 	tcpCheckers := []checker.TransportChecker{
 		checker.DstPort(TestPort),
 		checker.TCPFlags(header.TCPFlagAck),
-		checker.SeqNum(uint32(c.IRS) + 1),
-		checker.AckNum(uint32(iss) + 1),
+		checker.TCPSeqNum(uint32(c.IRS) + 1),
+		checker.TCPAckNum(uint32(iss) + 1),
 	}
 
 	// Verify that tsEcr of ACK packet is wantOptions.TSVal if the
@@ -882,8 +951,7 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
 	// Wait for connection to be established.
 	select {
 	case <-notifyCh:
-		err = c.EP.GetSockOpt(tcpip.ErrorOption{})
-		if err != nil {
+		if err := c.EP.LastError(); err != nil {
 			c.t.Fatalf("Unexpected error when connecting: %v", err)
 		}
 	case <-time.After(1 * time.Second):
@@ -898,7 +966,7 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
 
 	// Mark in context that timestamp option is enabled for this endpoint.
 	c.TimeStampEnabled = true
-
+	c.RcvdWindowScale = uint8(synOptions.WS)
 	return &RawEndpoint{
 		C:             c,
 		SrcPort:       tcpSeg.DestinationPort(),
@@ -949,12 +1017,12 @@ func (c *Context) AcceptWithOptions(wndScale int, synOptions header.TCPSynOption
 	wq.EventRegister(&we, waiter.EventIn)
 	defer wq.EventUnregister(&we)
 
-	c.EP, _, err = ep.Accept()
+	c.EP, _, err = ep.Accept(nil)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			c.EP, _, err = ep.Accept()
+			c.EP, _, err = ep.Accept(nil)
 			if err != nil {
 				c.t.Fatalf("Accept failed: %v", err)
 			}
@@ -991,6 +1059,7 @@ func (c *Context) PassiveConnect(maxPayload, wndScale int, synOptions header.TCP
 // value of the window scaling option to be sent in the SYN. If synOptions.WS >
 // 0 then we send the WindowScale option.
 func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions header.TCPSynOptions) *RawEndpoint {
+	c.t.Helper()
 	opts := make([]byte, header.TCPOptionsMaximumSize)
 	offset := 0
 	offset += header.EncodeMSSOption(uint32(maxPayload), opts)
@@ -1015,7 +1084,7 @@ func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions
 	offset += paddingToAdd
 
 	// Send a SYN request.
-	iss := seqnum.Value(testInitialSequenceNumber)
+	iss := seqnum.Value(TestInitialSequenceNumber)
 	c.SendPacket(nil, &Headers{
 		SrcPort: TestPort,
 		DstPort: StackPort,
@@ -1029,13 +1098,14 @@ func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions
 	// are present.
 	b := c.GetPacket()
 	tcp := header.TCP(header.IPv4(b).Payload())
+	rcvdSynOptions := header.ParseSynOptions(tcp.Options(), true /* isAck */)
 	c.IRS = seqnum.Value(tcp.SequenceNumber())
 
 	tcpCheckers := []checker.TransportChecker{
 		checker.SrcPort(StackPort),
 		checker.DstPort(TestPort),
 		checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
-		checker.AckNum(uint32(iss) + 1),
+		checker.TCPAckNum(uint32(iss) + 1),
 		checker.TCPSynOptions(header.TCPSynOptions{MSS: synOptions.MSS, WS: wndScale, SACKPermitted: synOptions.SACKPermitted && c.SACKEnabled()}),
 	}
 
@@ -1078,6 +1148,7 @@ func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions
 	// Send ACK.
 	c.SendPacket(nil, ackHeaders)
 
+	c.RcvdWindowScale = uint8(rcvdSynOptions.WS)
 	c.Port = StackPort
 
 	return &RawEndpoint{
@@ -1097,7 +1168,7 @@ func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions
 // SACKEnabled returns true if the TCP Protocol option SACKEnabled is set to true
 // for the Stack in the context.
 func (c *Context) SACKEnabled() bool {
-	var v tcp.SACKEnabled
+	var v tcpip.TCPSACKEnabled
 	if err := c.Stack().TransportProtocolOption(tcp.ProtocolNumber, &v); err != nil {
 		// Stack doesn't support SACK. So just return.
 		return false
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index b5d2d0ba6..c78549424 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -32,6 +32,7 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/header/parse",
         "//pkg/tcpip/ports",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 73608783c..d31177eb7 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -139,7 +139,7 @@ type endpoint struct {
 
 	// multicastMemberships that need to be remvoed when the endpoint is
 	// closed. Protected by the mu mutex.
-	multicastMemberships []multicastMembership
+	multicastMemberships map[multicastMembership]struct{}
 
 	// effectiveNetProtos contains the network protocols actually in use. In
 	// most cases it will only contain "netProto", but in cases like IPv6
@@ -154,6 +154,9 @@ type endpoint struct {
 
 	// owner is used to get uid and gid of the packet.
 	owner tcpip.PacketOwner
+
+	// linger is used for SO_LINGER socket option.
+	linger tcpip.LingerOption
 }
 
 // +stateify savable
@@ -182,12 +185,13 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		// TTL=1.
 		//
 		// Linux defaults to TTL=1.
-		multicastTTL:  1,
-		multicastLoop: true,
-		rcvBufSizeMax: 32 * 1024,
-		sndBufSizeMax: 32 * 1024,
-		state:         StateInitial,
-		uniqueID:      s.UniqueID(),
+		multicastTTL:         1,
+		multicastLoop:        true,
+		rcvBufSizeMax:        32 * 1024,
+		sndBufSizeMax:        32 * 1024,
+		multicastMemberships: make(map[multicastMembership]struct{}),
+		state:                StateInitial,
+		uniqueID:             s.UniqueID(),
 	}
 
 	// Override with stack defaults.
@@ -209,7 +213,7 @@ func (e *endpoint) UniqueID() uint64 {
 	return e.uniqueID
 }
 
-func (e *endpoint) takeLastError() *tcpip.Error {
+func (e *endpoint) LastError() *tcpip.Error {
 	e.lastErrorMu.Lock()
 	defer e.lastErrorMu.Unlock()
 
@@ -237,10 +241,10 @@ func (e *endpoint) Close() {
 		e.boundPortFlags = ports.Flags{}
 	}
 
-	for _, mem := range e.multicastMemberships {
+	for mem := range e.multicastMemberships {
 		e.stack.LeaveGroup(e.NetProto, mem.nicID, mem.multicastAddr)
 	}
-	e.multicastMemberships = nil
+	e.multicastMemberships = make(map[multicastMembership]struct{})
 
 	// Close the receive list and drain it.
 	e.rcvMu.Lock()
@@ -268,7 +272,7 @@ func (e *endpoint) ModerateRecvBuf(copied int) {}
 // Read reads data from the endpoint. This method does not block if
 // there is no data pending.
 func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
-	if err := e.takeLastError(); err != nil {
+	if err := e.LastError(); err != nil {
 		return buffer.View{}, tcpip.ControlMessages{}, err
 	}
 
@@ -411,7 +415,7 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 }
 
 func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
-	if err := e.takeLastError(); err != nil {
+	if err := e.LastError(); err != nil {
 		return 0, nil, err
 	}
 
@@ -683,9 +687,9 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 }
 
 // SetSockOpt implements tcpip.Endpoint.SetSockOpt.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	switch v := opt.(type) {
-	case tcpip.MulticastInterfaceOption:
+	case *tcpip.MulticastInterfaceOption:
 		e.mu.Lock()
 		defer e.mu.Unlock()
 
@@ -721,7 +725,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.multicastNICID = nic
 		e.multicastAddr = addr
 
-	case tcpip.AddMembershipOption:
+	case *tcpip.AddMembershipOption:
 		if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) {
 			return tcpip.ErrInvalidOptionValue
 		}
@@ -752,19 +756,17 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Lock()
 		defer e.mu.Unlock()
 
-		for _, mem := range e.multicastMemberships {
-			if mem == memToInsert {
-				return tcpip.ErrPortInUse
-			}
+		if _, ok := e.multicastMemberships[memToInsert]; ok {
+			return tcpip.ErrPortInUse
 		}
 
 		if err := e.stack.JoinGroup(e.NetProto, nicID, v.MulticastAddr); err != nil {
 			return err
 		}
 
-		e.multicastMemberships = append(e.multicastMemberships, memToInsert)
+		e.multicastMemberships[memToInsert] = struct{}{}
 
-	case tcpip.RemoveMembershipOption:
+	case *tcpip.RemoveMembershipOption:
 		if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) {
 			return tcpip.ErrInvalidOptionValue
 		}
@@ -786,18 +788,11 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		}
 
 		memToRemove := multicastMembership{nicID: nicID, multicastAddr: v.MulticastAddr}
-		memToRemoveIndex := -1
 
 		e.mu.Lock()
 		defer e.mu.Unlock()
 
-		for i, mem := range e.multicastMemberships {
-			if mem == memToRemove {
-				memToRemoveIndex = i
-				break
-			}
-		}
-		if memToRemoveIndex == -1 {
+		if _, ok := e.multicastMemberships[memToRemove]; !ok {
 			return tcpip.ErrBadLocalAddress
 		}
 
@@ -805,11 +800,10 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 			return err
 		}
 
-		e.multicastMemberships[memToRemoveIndex] = e.multicastMemberships[len(e.multicastMemberships)-1]
-		e.multicastMemberships = e.multicastMemberships[:len(e.multicastMemberships)-1]
+		delete(e.multicastMemberships, memToRemove)
 
-	case tcpip.BindToDeviceOption:
-		id := tcpip.NICID(v)
+	case *tcpip.BindToDeviceOption:
+		id := tcpip.NICID(*v)
 		if id != 0 && !e.stack.HasNIC(id) {
 			return tcpip.ErrUnknownDevice
 		}
@@ -817,8 +811,13 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.bindToDevice = id
 		e.mu.Unlock()
 
-	case tcpip.SocketDetachFilterOption:
+	case *tcpip.SocketDetachFilterOption:
 		return nil
+
+	case *tcpip.LingerOption:
+		e.mu.Lock()
+		e.linger = *v
+		e.mu.Unlock()
 	}
 	return nil
 }
@@ -896,6 +895,9 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 
 		return v, nil
 
+	case tcpip.AcceptConnOption:
+		return false, nil
+
 	default:
 		return false, tcpip.ErrUnknownProtocolOption
 	}
@@ -960,10 +962,8 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 	switch o := opt.(type) {
-	case tcpip.ErrorOption:
-		return e.takeLastError()
 	case *tcpip.MulticastInterfaceOption:
 		e.mu.Lock()
 		*o = tcpip.MulticastInterfaceOption{
@@ -977,6 +977,11 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		*o = tcpip.BindToDeviceOption(e.bindToDevice)
 		e.mu.RUnlock()
 
+	case *tcpip.LingerOption:
+		e.mu.RLock()
+		*o = e.linger
+		e.mu.RUnlock()
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -994,6 +999,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 
 	// Initialize the UDP header.
 	udp := header.UDP(pkt.TransportHeader().Push(header.UDPMinimumSize))
+	pkt.TransportProtocolNumber = ProtocolNumber
 
 	length := uint16(pkt.Size())
 	udp.Encode(&header.UDPFields{
@@ -1220,13 +1226,13 @@ func (*endpoint) Listen(int) *tcpip.Error {
 }
 
 // Accept is not supported by UDP, it just fails.
-func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	return nil, nil, tcpip.ErrNotSupported
 }
 
 func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) {
 	if e.ID.LocalPort == 0 {
-		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.portFlags, e.bindToDevice, tcpip.FullAddress{})
+		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.portFlags, e.bindToDevice, tcpip.FullAddress{}, nil /* testPort */)
 		if err != nil {
 			return id, e.bindToDevice, err
 		}
@@ -1366,6 +1372,22 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return result
 }
 
+// verifyChecksum verifies the checksum unless RX checksum offload is enabled.
+// On IPv4, UDP checksum is optional, and a zero value means the transmitter
+// omitted the checksum generation (RFC768).
+// On IPv6, UDP checksum is not optional (RFC2460 Section 8.1).
+func verifyChecksum(r *stack.Route, hdr header.UDP, pkt *stack.PacketBuffer) bool {
+	if r.Capabilities()&stack.CapabilityRXChecksumOffload == 0 &&
+		(hdr.Checksum() != 0 || r.NetProto == header.IPv6ProtocolNumber) {
+		xsum := r.PseudoHeaderChecksum(ProtocolNumber, hdr.Length())
+		for _, v := range pkt.Data.Views() {
+			xsum = header.Checksum(v, xsum)
+		}
+		return hdr.CalculateChecksum(xsum) == 0xffff
+	}
+	return true
+}
+
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
@@ -1378,33 +1400,13 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		return
 	}
 
-	// Never receive from a multicast address.
-	if header.IsV4MulticastAddress(id.RemoteAddress) ||
-		header.IsV6MulticastAddress(id.RemoteAddress) {
-		e.stack.Stats().UDP.InvalidSourceAddress.Increment()
-		e.stack.Stats().IP.InvalidSourceAddressesReceived.Increment()
-		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
+	if !verifyChecksum(r, hdr, pkt) {
+		// Checksum Error.
+		e.stack.Stats().UDP.ChecksumErrors.Increment()
+		e.stats.ReceiveErrors.ChecksumErrors.Increment()
 		return
 	}
 
-	// Verify checksum unless RX checksum offload is enabled.
-	// On IPv4, UDP checksum is optional, and a zero value means
-	// the transmitter omitted the checksum generation (RFC768).
-	// On IPv6, UDP checksum is not optional (RFC2460 Section 8.1).
-	if r.Capabilities()&stack.CapabilityRXChecksumOffload == 0 &&
-		(hdr.Checksum() != 0 || r.NetProto == header.IPv6ProtocolNumber) {
-		xsum := r.PseudoHeaderChecksum(ProtocolNumber, hdr.Length())
-		for _, v := range pkt.Data.Views() {
-			xsum = header.Checksum(v, xsum)
-		}
-		if hdr.CalculateChecksum(xsum) != 0xffff {
-			// Checksum Error.
-			e.stack.Stats().UDP.ChecksumErrors.Increment()
-			e.stats.ReceiveErrors.ChecksumErrors.Increment()
-			return
-		}
-	}
-
 	e.stack.Stats().UDP.PacketsReceived.Increment()
 	e.stats.PacketsReceived.Increment()
 
diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go
index 851e6b635..858c99a45 100644
--- a/pkg/tcpip/transport/udp/endpoint_state.go
+++ b/pkg/tcpip/transport/udp/endpoint_state.go
@@ -92,7 +92,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 
 	e.stack = s
 
-	for _, m := range e.multicastMemberships {
+	for m := range e.multicastMemberships {
 		if err := e.stack.JoinGroup(e.NetProto, m.nicID, m.multicastAddr); err != nil {
 			panic(err)
 		}
diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go
index c67e0ba95..3ae6cc221 100644
--- a/pkg/tcpip/transport/udp/forwarder.go
+++ b/pkg/tcpip/transport/udp/forwarder.go
@@ -81,6 +81,7 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint,
 	ep.ID = r.id
 	ep.route = r.route.Clone()
 	ep.dstPort = r.id.RemotePort
+	ep.effectiveNetProtos = []tcpip.NetworkProtocolNumber{r.route.NetProto}
 	ep.RegisterNICID = r.route.NICID()
 	ep.boundPortFlags = ep.portFlags
 
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 63d4bed7c..da5b1deb2 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -12,18 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package udp contains the implementation of the UDP transport protocol. To use
-// it in the networking stack, this package must be added to the project, and
-// activated on the stack by passing udp.NewProtocol() as one of the
-// transport protocols when calling stack.New(). Then endpoints can be created
-// by passing udp.ProtocolNumber as the transport protocol number when calling
-// Stack.NewEndpoint().
+// Package udp contains the implementation of the UDP transport protocol.
 package udp
 
 import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/header/parse"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -49,6 +45,7 @@ const (
 )
 
 type protocol struct {
+	stack *stack.Stack
 }
 
 // Number returns the udp protocol number.
@@ -57,14 +54,14 @@ func (*protocol) Number() tcpip.TransportProtocolNumber {
 }
 
 // NewEndpoint creates a new udp endpoint.
-func (*protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
-	return newEndpoint(stack, netProto, waiterQueue), nil
+func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return newEndpoint(p.stack, netProto, waiterQueue), nil
 }
 
 // NewRawEndpoint creates a new raw UDP endpoint. It implements
 // stack.TransportProtocol.NewRawEndpoint.
-func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
-	return raw.NewEndpoint(stack, netProto, header.UDPProtocolNumber, waiterQueue)
+func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return raw.NewEndpoint(p.stack, netProto, header.UDPProtocolNumber, waiterQueue)
 }
 
 // MinimumPacketSize returns the minimum valid udp packet size.
@@ -79,130 +76,30 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 	return h.SourcePort(), h.DestinationPort(), nil
 }
 
-// HandleUnknownDestinationPacket handles packets targeted at this protocol but
-// that don't match any existing endpoint.
-func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
+// HandleUnknownDestinationPacket handles packets that are targeted at this
+// protocol but don't match any existing endpoint.
+func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition {
 	hdr := header.UDP(pkt.TransportHeader().View())
 	if int(hdr.Length()) > pkt.Data.Size()+header.UDPMinimumSize {
-		// Malformed packet.
 		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
-		return true
-	}
-	// TODO(b/129426613): only send an ICMP message if UDP checksum is valid.
-
-	// Only send ICMP error if the address is not a multicast/broadcast
-	// v4/v6 address or the source is not the unspecified address.
-	//
-	// See: point e) in https://tools.ietf.org/html/rfc4443#section-2.4
-	if id.LocalAddress == header.IPv4Broadcast || header.IsV4MulticastAddress(id.LocalAddress) || header.IsV6MulticastAddress(id.LocalAddress) || id.RemoteAddress == header.IPv6Any || id.RemoteAddress == header.IPv4Any {
-		return true
+		return stack.UnknownDestinationPacketMalformed
 	}
 
-	// As per RFC: 1122 Section 3.2.2.1 A host SHOULD generate Destination
-	//   Unreachable messages with code:
-	//
-	//     2 (Protocol Unreachable), when the designated transport protocol
-	//     is not supported; or
-	//
-	//     3 (Port Unreachable), when the designated transport protocol
-	//     (e.g., UDP) is unable to demultiplex the datagram but has no
-	//     protocol mechanism to inform the sender.
-	switch len(id.LocalAddress) {
-	case header.IPv4AddressSize:
-		if !r.Stack().AllowICMPMessage() {
-			r.Stack().Stats().ICMP.V4PacketsSent.RateLimited.Increment()
-			return true
-		}
-		// As per RFC 1812 Section 4.3.2.3
-		//
-		//   ICMP datagram SHOULD contain as much of the original
-		//   datagram as possible without the length of the ICMP
-		//   datagram exceeding 576 bytes
-		//
-		// NOTE: The above RFC referenced is different from the original
-		// recommendation in RFC 1122 where it mentioned that at least 8
-		// bytes of the payload must be included. Today linux and other
-		// systems implement the] RFC1812 definition and not the original
-		// RFC 1122 requirement.
-		mtu := int(r.MTU())
-		if mtu > header.IPv4MinimumProcessableDatagramSize {
-			mtu = header.IPv4MinimumProcessableDatagramSize
-		}
-		headerLen := int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize
-		available := int(mtu) - headerLen
-		payloadLen := pkt.NetworkHeader().View().Size() + pkt.TransportHeader().View().Size() + pkt.Data.Size()
-		if payloadLen > available {
-			payloadLen = available
-		}
-
-		// The buffers used by pkt may be used elsewhere in the system.
-		// For example, a raw or packet socket may use what UDP
-		// considers an unreachable destination. Thus we deep copy pkt
-		// to prevent multiple ownership and SR errors.
-		newHeader := append(buffer.View(nil), pkt.NetworkHeader().View()...)
-		newHeader = append(newHeader, pkt.TransportHeader().View()...)
-		payload := newHeader.ToVectorisedView()
-		payload.AppendView(pkt.Data.ToView())
-		payload.CapLength(payloadLen)
-
-		icmpPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-			ReserveHeaderBytes: headerLen,
-			Data:               payload,
-		})
-		icmpHdr := header.ICMPv4(icmpPkt.TransportHeader().Push(header.ICMPv4MinimumSize))
-		icmpHdr.SetType(header.ICMPv4DstUnreachable)
-		icmpHdr.SetCode(header.ICMPv4PortUnreachable)
-		icmpHdr.SetChecksum(header.ICMPv4Checksum(icmpHdr, icmpPkt.Data))
-		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, icmpPkt)
-
-	case header.IPv6AddressSize:
-		if !r.Stack().AllowICMPMessage() {
-			r.Stack().Stats().ICMP.V6PacketsSent.RateLimited.Increment()
-			return true
-		}
-
-		// As per RFC 4443 section 2.4
-		//
-		//    (c) Every ICMPv6 error message (type < 128) MUST include
-		//    as much of the IPv6 offending (invoking) packet (the
-		//    packet that caused the error) as possible without making
-		//    the error message packet exceed the minimum IPv6 MTU
-		//    [IPv6].
-		mtu := int(r.MTU())
-		if mtu > header.IPv6MinimumMTU {
-			mtu = header.IPv6MinimumMTU
-		}
-		headerLen := int(r.MaxHeaderLength()) + header.ICMPv6DstUnreachableMinimumSize
-		available := int(mtu) - headerLen
-		network, transport := pkt.NetworkHeader().View(), pkt.TransportHeader().View()
-		payloadLen := len(network) + len(transport) + pkt.Data.Size()
-		if payloadLen > available {
-			payloadLen = available
-		}
-		payload := buffer.NewVectorisedView(len(network)+len(transport), []buffer.View{network, transport})
-		payload.Append(pkt.Data)
-		payload.CapLength(payloadLen)
-
-		icmpPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
-			ReserveHeaderBytes: headerLen,
-			Data:               payload,
-		})
-		icmpHdr := header.ICMPv6(icmpPkt.TransportHeader().Push(header.ICMPv6DstUnreachableMinimumSize))
-		icmpHdr.SetType(header.ICMPv6DstUnreachable)
-		icmpHdr.SetCode(header.ICMPv6PortUnreachable)
-		icmpHdr.SetChecksum(header.ICMPv6Checksum(icmpHdr, r.LocalAddress, r.RemoteAddress, icmpPkt.Data))
-		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, icmpPkt)
+	if !verifyChecksum(r, hdr, pkt) {
+		r.Stack().Stats().UDP.ChecksumErrors.Increment()
+		return stack.UnknownDestinationPacketMalformed
 	}
-	return true
+
+	return stack.UnknownDestinationPacketUnhandled
 }
 
 // SetOption implements stack.TransportProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+func (*protocol) SetOption(tcpip.SettableTransportProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
 // Option implements stack.TransportProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+func (*protocol) Option(tcpip.GettableTransportProtocolOption) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
@@ -214,11 +111,10 @@ func (*protocol) Wait() {}
 
 // Parse implements stack.TransportProtocol.Parse.
 func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
-	_, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize)
-	return ok
+	return parse.UDP(pkt)
 }
 
 // NewProtocol returns a UDP transport protocol.
-func NewProtocol() stack.TransportProtocol {
-	return &protocol{}
+func NewProtocol(s *stack.Stack) stack.TransportProtocol {
+	return &protocol{stack: s}
 }
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index f87d99d5a..f9fbfa954 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -294,8 +294,8 @@ type testContext struct {
 func newDualTestContext(t *testing.T, mtu uint32) *testContext {
 	t.Helper()
 	return newDualTestContextWithOptions(t, mtu, stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
 	})
 }
 
@@ -388,6 +388,10 @@ func (c *testContext) getPacketAndVerify(flow testFlow, checkers ...checker.Netw
 		c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, flow.netProto())
 	}
 
+	if got, want := p.Pkt.TransportProtocolNumber, header.UDPProtocolNumber; got != want {
+		c.t.Errorf("got p.Pkt.TransportProtocolNumber = %d, want = %d", got, want)
+	}
+
 	vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views())
 	b := vv.ToView()
 
@@ -403,18 +407,35 @@ func (c *testContext) getPacketAndVerify(flow testFlow, checkers ...checker.Netw
 }
 
 // injectPacket creates a packet of the given flow and with the given payload,
-// and injects it into the link endpoint.
-func (c *testContext) injectPacket(flow testFlow, payload []byte) {
+// and injects it into the link endpoint. If badChecksum is true, the packet has
+// a bad checksum in the UDP header.
+func (c *testContext) injectPacket(flow testFlow, payload []byte, badChecksum bool) {
 	c.t.Helper()
 
 	h := flow.header4Tuple(incoming)
 	if flow.isV4() {
 		buf := c.buildV4Packet(payload, &h)
+		if badChecksum {
+			// Invalidate the UDP header checksum field, taking care to avoid
+			// overflow to zero, which would disable checksum validation.
+			for u := header.UDP(buf[header.IPv4MinimumSize:]); ; {
+				u.SetChecksum(u.Checksum() + 1)
+				if u.Checksum() != 0 {
+					break
+				}
+			}
+		}
 		c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
 			Data: buf.ToVectorisedView(),
 		}))
 	} else {
 		buf := c.buildV6Packet(payload, &h)
+		if badChecksum {
+			// Invalidate the UDP header checksum field (Unlike IPv4, zero is
+			// a valid checksum value for IPv6 so no need to avoid it).
+			u := header.UDP(buf[header.IPv6MinimumSize:])
+			u.SetChecksum(u.Checksum() + 1)
+		}
 		c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
 			Data: buf.ToVectorisedView(),
 		}))
@@ -511,8 +532,8 @@ func newMinPayload(minSize int) []byte {
 
 func TestBindToDeviceOption(t *testing.T) {
 	s := stack.New(stack.Options{
-		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}})
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}})
 
 	ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
 	if err != nil {
@@ -522,7 +543,7 @@ func TestBindToDeviceOption(t *testing.T) {
 
 	opts := stack.NICOptions{Name: "my_device"}
 	if err := s.CreateNICWithOptions(321, loopback.New(), opts); err != nil {
-		t.Errorf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err)
+		t.Errorf("CreateNICWithOptions(_, _, %+v) failed: %s", opts, err)
 	}
 
 	// nicIDPtr is used instead of taking the address of NICID literals, which is
@@ -546,16 +567,15 @@ func TestBindToDeviceOption(t *testing.T) {
 		t.Run(testAction.name, func(t *testing.T) {
 			if testAction.setBindToDevice != nil {
 				bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice)
-				if gotErr, wantErr := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr {
-					t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, gotErr, wantErr)
+				if gotErr, wantErr := ep.SetSockOpt(&bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr {
+					t.Errorf("got SetSockOpt(&%T(%d)) = %s, want = %s", bindToDevice, bindToDevice, gotErr, wantErr)
 				}
 			}
 			bindToDevice := tcpip.BindToDeviceOption(88888)
 			if err := ep.GetSockOpt(&bindToDevice); err != nil {
-				t.Errorf("GetSockOpt got %v, want %v", err, nil)
-			}
-			if got, want := bindToDevice, testAction.getBindToDevice; got != want {
-				t.Errorf("bindToDevice got %d, want %d", got, want)
+				t.Errorf("GetSockOpt(&%T): %s", bindToDevice, err)
+			} else if bindToDevice != testAction.getBindToDevice {
+				t.Errorf("got bindToDevice = %d, want = %d", bindToDevice, testAction.getBindToDevice)
 			}
 		})
 	}
@@ -569,7 +589,7 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	c.t.Helper()
 
 	payload := newPayload()
-	c.injectPacket(flow, payload)
+	c.injectPacket(flow, payload, false)
 
 	// Try to receive the data.
 	we, ch := waiter.NewChannelEntry(nil)
@@ -611,12 +631,12 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	// Check the peer address.
 	h := flow.header4Tuple(incoming)
 	if addr.Addr != h.srcAddr.Addr {
-		c.t.Fatalf("unexpected remote address: got %s, want %v", addr.Addr, h.srcAddr)
+		c.t.Fatalf("got address = %s, want = %s", addr.Addr, h.srcAddr.Addr)
 	}
 
 	// Check the payload.
 	if !bytes.Equal(payload, v) {
-		c.t.Fatalf("bad payload: got %x, want %x", v, payload)
+		c.t.Fatalf("got payload = %x, want = %x", v, payload)
 	}
 
 	// Run any checkers against the ControlMessages.
@@ -677,7 +697,7 @@ func TestBindReservedPort(t *testing.T) {
 		}
 		defer ep.Close()
 		if got, want := ep.Bind(addr), tcpip.ErrPortInUse; got != want {
-			t.Fatalf("got ep.Bind(...) = %v, want = %v", got, want)
+			t.Fatalf("got ep.Bind(...) = %s, want = %s", got, want)
 		}
 	}
 
@@ -690,7 +710,7 @@ func TestBindReservedPort(t *testing.T) {
 		// We can't bind ipv4-any on the port reserved by the connected endpoint
 		// above, since the endpoint is dual-stack.
 		if got, want := ep.Bind(tcpip.FullAddress{Port: addr.Port}), tcpip.ErrPortInUse; got != want {
-			t.Fatalf("got ep.Bind(...) = %v, want = %v", got, want)
+			t.Fatalf("got ep.Bind(...) = %s, want = %s", got, want)
 		}
 		// We can bind an ipv4 address on this port, though.
 		if err := ep.Bind(tcpip.FullAddress{Addr: stackAddr, Port: addr.Port}); err != nil {
@@ -787,8 +807,8 @@ func TestV4ReadSelfSource(t *testing.T) {
 	} {
 		t.Run(tt.name, func(t *testing.T) {
 			c := newDualTestContextWithOptions(t, defaultMTU, stack.Options{
-				NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
-				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
 				HandleLocal:        tt.handleLocal,
 			})
 			defer c.cleanup()
@@ -813,7 +833,7 @@ func TestV4ReadSelfSource(t *testing.T) {
 			}
 
 			if _, _, err := c.ep.Read(nil); err != tt.wantErr {
-				t.Errorf("c.ep.Read() got error %v, want %v", err, tt.wantErr)
+				t.Errorf("got c.ep.Read(nil) = %s, want = %s", err, tt.wantErr)
 			}
 		})
 	}
@@ -854,8 +874,8 @@ func TestReadOnBoundToMulticast(t *testing.T) {
 
 			// Join multicast group.
 			ifoptSet := tcpip.AddMembershipOption{NIC: 1, MulticastAddr: mcastAddr}
-			if err := c.ep.SetSockOpt(ifoptSet); err != nil {
-				c.t.Fatal("SetSockOpt failed:", err)
+			if err := c.ep.SetSockOpt(&ifoptSet); err != nil {
+				c.t.Fatalf("SetSockOpt(&%#v): %s", ifoptSet, err)
 			}
 
 			// Check that we receive multicast packets but not unicast or broadcast
@@ -908,42 +928,6 @@ func TestReadFromMulticast(t *testing.T) {
 	}
 }
 
-// TestReadFromMulticaststats checks that a discarded packet
-// that that was sent with multicast SOURCE address increments
-// the correct counters and that a regular packet does not.
-func TestReadFromMulticastStats(t *testing.T) {
-	t.Helper()
-	for _, flow := range []testFlow{reverseMulticast4, reverseMulticast6, unicastV4} {
-		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
-			c := newDualTestContext(t, defaultMTU)
-			defer c.cleanup()
-
-			c.createEndpointForFlow(flow)
-
-			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-				t.Fatalf("Bind failed: %s", err)
-			}
-
-			payload := newPayload()
-			c.injectPacket(flow, payload)
-
-			var want uint64 = 0
-			if flow.isReverseMulticast() {
-				want = 1
-			}
-			if got := c.s.Stats().IP.InvalidSourceAddressesReceived.Value(); got != want {
-				t.Errorf("got stats.IP.InvalidSourceAddressesReceived.Value() = %d, want = %d", got, want)
-			}
-			if got := c.s.Stats().UDP.InvalidSourceAddress.Value(); got != want {
-				t.Errorf("got stats.UDP.InvalidSourceAddress.Value() = %d, want = %d", got, want)
-			}
-			if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.MalformedPacketsReceived.Value(); got != want {
-				t.Errorf("got EP Stats.ReceiveErrors.MalformedPacketsReceived stats = %d, want = %d", got, want)
-			}
-		})
-	}
-}
-
 // TestV4ReadBroadcastOnBoundToWildcard checks that an endpoint can bind to ANY
 // and receive broadcast and unicast data.
 func TestV4ReadBroadcastOnBoundToWildcard(t *testing.T) {
@@ -1386,8 +1370,8 @@ func TestReadIPPacketInfo(t *testing.T) {
 
 			if test.flow.isMulticast() {
 				ifoptSet := tcpip.AddMembershipOption{NIC: 1, MulticastAddr: test.flow.getMcastAddr()}
-				if err := c.ep.SetSockOpt(ifoptSet); err != nil {
-					c.t.Fatalf("SetSockOpt(%+v): %s:", ifoptSet, err)
+				if err := c.ep.SetSockOpt(&ifoptSet); err != nil {
+					c.t.Fatalf("SetSockOpt(&%#v): %s:", ifoptSet, err)
 				}
 			}
 
@@ -1446,6 +1430,32 @@ func TestNoChecksum(t *testing.T) {
 	}
 }
 
+var _ stack.NetworkInterface = (*testInterface)(nil)
+
+type testInterface struct {
+	stack.NetworkLinkEndpoint
+}
+
+func (*testInterface) ID() tcpip.NICID {
+	return 0
+}
+
+func (*testInterface) IsLoopback() bool {
+	return false
+}
+
+func (*testInterface) Name() string {
+	return ""
+}
+
+func (*testInterface) Enabled() bool {
+	return true
+}
+
+func (*testInterface) WritePacketToRemote(tcpip.LinkAddress, *stack.GSO, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
 func TestTTL(t *testing.T) {
 	for _, flow := range []testFlow{unicastV4, unicastV4in6, unicastV6, unicastV6Only, multicastV4, multicastV4in6, multicastV6, broadcast, broadcastIn6} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
@@ -1463,16 +1473,19 @@ func TestTTL(t *testing.T) {
 			if flow.isMulticast() {
 				wantTTL = multicastTTL
 			} else {
-				var p stack.NetworkProtocol
+				var p stack.NetworkProtocolFactory
+				var n tcpip.NetworkProtocolNumber
 				if flow.isV4() {
-					p = ipv4.NewProtocol()
+					p = ipv4.NewProtocol
+					n = ipv4.ProtocolNumber
 				} else {
-					p = ipv6.NewProtocol()
+					p = ipv6.NewProtocol
+					n = ipv6.ProtocolNumber
 				}
-				ep := p.NewEndpoint(0, nil, nil, nil, stack.New(stack.Options{
-					NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
-					TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
-				}))
+				s := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocolFactory{p},
+				})
+				ep := s.NetworkProtocolInstance(n).NewEndpoint(&testInterface{}, nil, nil, nil)
 				wantTTL = ep.DefaultTTL()
 				ep.Close()
 			}
@@ -1496,18 +1509,6 @@ func TestSetTTL(t *testing.T) {
 						c.t.Fatalf("SetSockOptInt(TTLOption, %d) failed: %s", wantTTL, err)
 					}
 
-					var p stack.NetworkProtocol
-					if flow.isV4() {
-						p = ipv4.NewProtocol()
-					} else {
-						p = ipv6.NewProtocol()
-					}
-					ep := p.NewEndpoint(0, nil, nil, nil, stack.New(stack.Options{
-						NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
-						TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
-					}))
-					ep.Close()
-
 					testWrite(c, flow, checker.TTL(wantTTL))
 				})
 			}
@@ -1530,7 +1531,7 @@ func TestSetTOS(t *testing.T) {
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(IPv4TOSOption) = 0x%x, want = 0x%x", v, 0)
+				c.t.Errorf("got GetSockOptInt(IPv4TOSOption) = 0x%x, want = 0x%x", v, 0)
 			}
 
 			if err := c.ep.SetSockOptInt(tcpip.IPv4TOSOption, tos); err != nil {
@@ -1691,19 +1692,17 @@ func TestMulticastInterfaceOption(t *testing.T) {
 								}
 							}
 
-							if err := c.ep.SetSockOpt(ifoptSet); err != nil {
-								c.t.Fatalf("SetSockOpt failed: %s", err)
+							if err := c.ep.SetSockOpt(&ifoptSet); err != nil {
+								c.t.Fatalf("SetSockOpt(&%#v): %s", ifoptSet, err)
 							}
 
 							// Verify multicast interface addr and NIC were set correctly.
 							// Note that NIC must be 1 since this is our outgoing interface.
-							ifoptWant := tcpip.MulticastInterfaceOption{NIC: 1, InterfaceAddr: ifoptSet.InterfaceAddr}
 							var ifoptGot tcpip.MulticastInterfaceOption
 							if err := c.ep.GetSockOpt(&ifoptGot); err != nil {
-								c.t.Fatalf("GetSockOpt failed: %s", err)
-							}
-							if ifoptGot != ifoptWant {
-								c.t.Errorf("got GetSockOpt() = %#v, want = %#v", ifoptGot, ifoptWant)
+								c.t.Fatalf("GetSockOpt(&%T): %s", ifoptGot, err)
+							} else if ifoptWant := (tcpip.MulticastInterfaceOption{NIC: 1, InterfaceAddr: ifoptSet.InterfaceAddr}); ifoptGot != ifoptWant {
+								c.t.Errorf("got multicast interface option = %#v, want = %#v", ifoptGot, ifoptWant)
 							}
 						})
 					}
@@ -1727,21 +1726,33 @@ func TestV4UnknownDestination(t *testing.T) {
 		// so that the final generated IPv4 packet is larger than
 		// header.IPv4MinimumProcessableDatagramSize.
 		largePayload bool
+		// badChecksum if true, will set an invalid checksum in the
+		// header.
+		badChecksum bool
 	}{
-		{unicastV4, true, false},
-		{unicastV4, true, true},
-		{multicastV4, false, false},
-		{multicastV4, false, true},
-		{broadcast, false, false},
-		{broadcast, false, true},
-	}
+		{unicastV4, true, false, false},
+		{unicastV4, true, true, false},
+		{unicastV4, false, false, true},
+		{unicastV4, false, true, true},
+		{multicastV4, false, false, false},
+		{multicastV4, false, true, false},
+		{broadcast, false, false, false},
+		{broadcast, false, true, false},
+	}
+	checksumErrors := uint64(0)
 	for _, tc := range testCases {
-		t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t", tc.flow, tc.icmpRequired, tc.largePayload), func(t *testing.T) {
+		t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t badChecksum:%t", tc.flow, tc.icmpRequired, tc.largePayload, tc.badChecksum), func(t *testing.T) {
 			payload := newPayload()
 			if tc.largePayload {
 				payload = newMinPayload(576)
 			}
-			c.injectPacket(tc.flow, payload)
+			c.injectPacket(tc.flow, payload, tc.badChecksum)
+			if tc.badChecksum {
+				checksumErrors++
+				if got, want := c.s.Stats().UDP.ChecksumErrors.Value(), checksumErrors; got != want {
+					t.Fatalf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+				}
+			}
 			if !tc.icmpRequired {
 				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
 				defer cancel()
@@ -1771,16 +1782,26 @@ func TestV4UnknownDestination(t *testing.T) {
 				checker.ICMPv4Type(header.ICMPv4DstUnreachable),
 				checker.ICMPv4Code(header.ICMPv4PortUnreachable)))
 
+			// We need to compare the included data part of the UDP packet that is in
+			// the ICMP packet with the matching original data.
 			icmpPkt := header.ICMPv4(hdr.Payload())
 			payloadIPHeader := header.IPv4(icmpPkt.Payload())
+			incomingHeaderLength := header.IPv4MinimumSize + header.UDPMinimumSize
 			wantLen := len(payload)
 			if tc.largePayload {
-				wantLen = header.IPv4MinimumProcessableDatagramSize - header.IPv4MinimumSize*2 - header.ICMPv4MinimumSize - header.UDPMinimumSize
+				// To work out the data size we need to simulate what the sender would
+				// have done. The wanted size is the total available minus the sum of
+				// the headers in the UDP AND ICMP packets, given that we know the test
+				// had only a minimal IP header but the ICMP sender will have allowed
+				// for a maximally sized packet header.
+				wantLen = header.IPv4MinimumProcessableDatagramSize - header.IPv4MaximumHeaderSize - header.ICMPv4MinimumSize - incomingHeaderLength
+
 			}
 
-			// In case of large payloads the IP packet may be truncated. Update
+			// In the case of large payloads the IP packet may be truncated. Update
 			// the length field before retrieving the udp datagram payload.
-			payloadIPHeader.SetTotalLength(uint16(wantLen + header.UDPMinimumSize + header.IPv4MinimumSize))
+			// Add back the two headers within the payload.
+			payloadIPHeader.SetTotalLength(uint16(wantLen + incomingHeaderLength))
 
 			origDgram := header.UDP(payloadIPHeader.Payload())
 			if got, want := len(origDgram.Payload()), wantLen; got != want {
@@ -1806,19 +1827,31 @@ func TestV6UnknownDestination(t *testing.T) {
 		// largePayload if true will result in a payload large enough to
 		// create an IPv6 packet > header.IPv6MinimumMTU bytes.
 		largePayload bool
+		// badChecksum if true, will set an invalid checksum in the
+		// header.
+		badChecksum bool
 	}{
-		{unicastV6, true, false},
-		{unicastV6, true, true},
-		{multicastV6, false, false},
-		{multicastV6, false, true},
-	}
+		{unicastV6, true, false, false},
+		{unicastV6, true, true, false},
+		{unicastV6, false, false, true},
+		{unicastV6, false, true, true},
+		{multicastV6, false, false, false},
+		{multicastV6, false, true, false},
+	}
+	checksumErrors := uint64(0)
 	for _, tc := range testCases {
-		t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t", tc.flow, tc.icmpRequired, tc.largePayload), func(t *testing.T) {
+		t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t badChecksum:%t", tc.flow, tc.icmpRequired, tc.largePayload, tc.badChecksum), func(t *testing.T) {
 			payload := newPayload()
 			if tc.largePayload {
 				payload = newMinPayload(1280)
 			}
-			c.injectPacket(tc.flow, payload)
+			c.injectPacket(tc.flow, payload, tc.badChecksum)
+			if tc.badChecksum {
+				checksumErrors++
+				if got, want := c.s.Stats().UDP.ChecksumErrors.Value(), checksumErrors; got != want {
+					t.Fatalf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+				}
+			}
 			if !tc.icmpRequired {
 				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
 				defer cancel()
@@ -1953,74 +1986,29 @@ func TestShortHeader(t *testing.T) {
 	}
 }
 
-// TestIncrementChecksumErrorsV4 verifies if a checksum error is detected,
+// TestBadChecksumErrors verifies if a checksum error is detected,
 // global and endpoint stats are incremented.
-func TestIncrementChecksumErrorsV4(t *testing.T) {
-	c := newDualTestContext(t, defaultMTU)
-	defer c.cleanup()
-
-	c.createEndpoint(ipv4.ProtocolNumber)
-	// Bind to wildcard.
-	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-		c.t.Fatalf("Bind failed: %s", err)
-	}
-
-	payload := newPayload()
-	h := unicastV4.header4Tuple(incoming)
-	buf := c.buildV4Packet(payload, &h)
+func TestBadChecksumErrors(t *testing.T) {
+	for _, flow := range []testFlow{unicastV4, unicastV6} {
+		c := newDualTestContext(t, defaultMTU)
+		defer c.cleanup()
 
-	// Invalidate the UDP header checksum field, taking care to avoid
-	// overflow to zero, which would disable checksum validation.
-	for u := header.UDP(buf[header.IPv4MinimumSize:]); ; {
-		u.SetChecksum(u.Checksum() + 1)
-		if u.Checksum() != 0 {
-			break
+		c.createEndpoint(flow.sockProto())
+		// Bind to wildcard.
+		if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+			c.t.Fatalf("Bind failed: %s", err)
 		}
-	}
-
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
-		Data: buf.ToVectorisedView(),
-	}))
-
-	const want = 1
-	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
-		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
-	}
-	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
-		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
-	}
-}
 
-// TestIncrementChecksumErrorsV6 verifies if a checksum error is detected,
-// global and endpoint stats are incremented.
-func TestIncrementChecksumErrorsV6(t *testing.T) {
-	c := newDualTestContext(t, defaultMTU)
-	defer c.cleanup()
+		payload := newPayload()
+		c.injectPacket(flow, payload, true /* badChecksum */)
 
-	c.createEndpoint(ipv6.ProtocolNumber)
-	// Bind to wildcard.
-	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-		c.t.Fatalf("Bind failed: %s", err)
-	}
-
-	payload := newPayload()
-	h := unicastV6.header4Tuple(incoming)
-	buf := c.buildV6Packet(payload, &h)
-
-	// Invalidate the UDP header checksum field.
-	u := header.UDP(buf[header.IPv6MinimumSize:])
-	u.SetChecksum(u.Checksum() + 1)
-
-	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
-		Data: buf.ToVectorisedView(),
-	}))
-
-	const want = 1
-	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
-		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
-	}
-	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
-		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+		const want = 1
+		if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+			t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+		}
+		if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+			t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+		}
 	}
 }
 
@@ -2039,7 +2027,8 @@ func TestPayloadModifiedV4(t *testing.T) {
 	payload := newPayload()
 	h := unicastV4.header4Tuple(incoming)
 	buf := c.buildV4Packet(payload, &h)
-	// Modify the payload so that the checksum value in the UDP header will be incorrect.
+	// Modify the payload so that the checksum value in the UDP header will be
+	// incorrect.
 	buf[len(buf)-1]++
 	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
 		Data: buf.ToVectorisedView(),
@@ -2069,7 +2058,8 @@ func TestPayloadModifiedV6(t *testing.T) {
 	payload := newPayload()
 	h := unicastV6.header4Tuple(incoming)
 	buf := c.buildV6Packet(payload, &h)
-	// Modify the payload so that the checksum value in the UDP header will be incorrect.
+	// Modify the payload so that the checksum value in the UDP header will be
+	// incorrect.
 	buf[len(buf)-1]++
 	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
 		Data: buf.ToVectorisedView(),
@@ -2350,17 +2340,18 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
 					NIC:         nicID1,
 				},
 			},
-			remoteAddr:           remNetSubnetBcast,
-			requiresBroadcastOpt: true,
+			remoteAddr: remNetSubnetBcast,
+			// TODO(gvisor.dev/issue/3938): Once we support marking a route as
+			// broadcast, this test should require the broadcast option to be set.
+			requiresBroadcastOpt: false,
 		},
 	}
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
-				NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
-
-				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
+				TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
 			})
 			e := channel.New(0, defaultMTU, "")
 			if err := s.CreateNIC(nicID1, e); err != nil {
diff --git a/pkg/test/dockerutil/container.go b/pkg/test/dockerutil/container.go
index 052b6b99d..64d17f661 100644
--- a/pkg/test/dockerutil/container.go
+++ b/pkg/test/dockerutil/container.go
@@ -22,6 +22,7 @@ import (
 	"net"
 	"os"
 	"path"
+	"path/filepath"
 	"regexp"
 	"strconv"
 	"strings"
@@ -403,10 +404,13 @@ func (c *Container) CopyFiles(opts *RunOpts, target string, sources ...string) {
 		return
 	}
 	for _, name := range sources {
-		src, err := testutil.FindFile(name)
-		if err != nil {
-			c.copyErr = fmt.Errorf("testutil.FindFile(%q) failed: %v", name, err)
-			return
+		src := name
+		if !filepath.IsAbs(src) {
+			src, err = testutil.FindFile(name)
+			if err != nil {
+				c.copyErr = fmt.Errorf("testutil.FindFile(%q) failed: %w", name, err)
+				return
+			}
 		}
 		dst := path.Join(dir, path.Base(name))
 		if err := testutil.Copy(src, dst); err != nil {
diff --git a/pkg/test/dockerutil/dockerutil.go b/pkg/test/dockerutil/dockerutil.go
index 952871f95..7027df1a5 100644
--- a/pkg/test/dockerutil/dockerutil.go
+++ b/pkg/test/dockerutil/dockerutil.go
@@ -60,7 +60,6 @@ var (
 	// enabled for each run.
 	pprofBlock = flag.Bool("pprof-block", false, "enables block profiling with runsc debug")
 	pprofCPU   = flag.Bool("pprof-cpu", false, "enables CPU profiling with runsc debug")
-	pprofGo    = flag.Bool("pprof-go", false, "enables goroutine profiling with runsc debug")
 	pprofHeap  = flag.Bool("pprof-heap", false, "enables heap profiling with runsc debug")
 	pprofMutex = flag.Bool("pprof-mutex", false, "enables mutex profiling with runsc debug")
 )
diff --git a/pkg/test/dockerutil/profile.go b/pkg/test/dockerutil/profile.go
index f0396ef24..55f9496cd 100644
--- a/pkg/test/dockerutil/profile.go
+++ b/pkg/test/dockerutil/profile.go
@@ -63,7 +63,7 @@ type Pprof struct {
 
 // MakePprofFromFlags makes a Pprof profile from flags.
 func MakePprofFromFlags(c *Container) *Pprof {
-	if !(*pprofBlock || *pprofCPU || *pprofGo || *pprofHeap || *pprofMutex) {
+	if !(*pprofBlock || *pprofCPU || *pprofHeap || *pprofMutex) {
 		return nil
 	}
 	return &Pprof{
diff --git a/pkg/test/testutil/BUILD b/pkg/test/testutil/BUILD
index 2d8f56bc0..c4b131896 100644
--- a/pkg/test/testutil/BUILD
+++ b/pkg/test/testutil/BUILD
@@ -12,7 +12,7 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/sync",
-        "//runsc/boot",
+        "//runsc/config",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go
index 1580527b5..49ab87c58 100644
--- a/pkg/test/testutil/testutil.go
+++ b/pkg/test/testutil/testutil.go
@@ -44,7 +44,7 @@ import (
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -133,25 +133,28 @@ func Command(logger Logger, args ...string) *Cmd {
 
 // TestConfig returns the default configuration to use in tests. Note that
 // 'RootDir' must be set by caller if required.
-func TestConfig(t *testing.T) *boot.Config {
+func TestConfig(t *testing.T) *config.Config {
 	logDir := os.TempDir()
 	if dir, ok := os.LookupEnv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
 		logDir = dir + "/"
 	}
-	return &boot.Config{
-		Debug:              true,
-		DebugLog:           path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%"),
-		LogFormat:          "text",
-		DebugLogFormat:     "text",
-		LogPackets:         true,
-		Network:            boot.NetworkNone,
-		Strace:             true,
-		Platform:           "ptrace",
-		FileAccess:         boot.FileAccessExclusive,
-		NumNetworkChannels: 1,
 
-		TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
+	// Only register flags if config is being used. Otherwise anyone that uses
+	// testutil will get flags registered and they may conflict.
+	config.RegisterFlags()
+
+	conf, err := config.NewFromFlags()
+	if err != nil {
+		panic(err)
 	}
+	// Change test defaults.
+	conf.Debug = true
+	conf.DebugLog = path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%")
+	conf.LogPackets = true
+	conf.Network = config.NetworkNone
+	conf.Strace = true
+	conf.TestOnlyAllowRunAsCurrentUserWithoutChroot = true
+	return conf
 }
 
 // NewSpecWithArgs creates a simple spec with the given args suitable for use
@@ -203,7 +206,7 @@ func SetupRootDir() (string, func(), error) {
 
 // SetupContainer creates a bundle and root dir for the container, generates a
 // test config, and writes the spec to config.json in the bundle dir.
-func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, cleanup func(), err error) {
+func SetupContainer(spec *specs.Spec, conf *config.Config) (rootDir, bundleDir string, cleanup func(), err error) {
 	rootDir, rootCleanup, err := SetupRootDir()
 	if err != nil {
 		return "", "", nil, err
@@ -243,12 +246,15 @@ func writeSpec(dir string, spec *specs.Spec) error {
 	return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755)
 }
 
+// idRandomSrc is a pseudo random generator used to in RandomID.
+var idRandomSrc = rand.New(rand.NewSource(time.Now().UnixNano()))
+
 // RandomID returns 20 random bytes following the given prefix.
 func RandomID(prefix string) string {
 	// Read 20 random bytes.
 	b := make([]byte, 20)
 	// "[Read] always returns len(p) and a nil error." --godoc
-	if _, err := rand.Read(b); err != nil {
+	if _, err := idRandomSrc.Read(b); err != nil {
 		panic("rand.Read failed: " + err.Error())
 	}
 	if prefix != "" {
@@ -264,7 +270,7 @@ func RandomID(prefix string) string {
 // same name, sometimes between test runs the socket does not get cleaned up
 // quickly enough, causing container creation to fail.
 func RandomContainerID() string {
-	return RandomID("test-container-")
+	return RandomID("test-container")
 }
 
 // Copy copies file from src to dst.
@@ -326,13 +332,13 @@ func PollContext(ctx context.Context, cb func() error) error {
 }
 
 // WaitForHTTP tries GET requests on a port until the call succeeds or timeout.
-func WaitForHTTP(port int, timeout time.Duration) error {
+func WaitForHTTP(ip string, port int, timeout time.Duration) error {
 	cb := func() error {
 		c := &http.Client{
 			// Calculate timeout to be able to do minimum 5 attempts.
 			Timeout: timeout / 5,
 		}
-		url := fmt.Sprintf("http://localhost:%d/", port)
+		url := fmt.Sprintf("http://%s:%d/", ip, port)
 		resp, err := c.Get(url)
 		if err != nil {
 			log.Printf("Waiting %s: %v", url, err)
diff --git a/pkg/unet/unet.go b/pkg/unet/unet.go
index d843f19cf..c976d7230 100644
--- a/pkg/unet/unet.go
+++ b/pkg/unet/unet.go
@@ -522,7 +522,7 @@ func (s *ServerSocket) Listen() error {
 // This is always blocking.
 //
 // Preconditions:
-//  * ServerSocket is listening (Listen called).
+// * ServerSocket is listening (Listen called).
 func (s *ServerSocket) Accept() (*Socket, error) {
 	fd, ok := s.socket.enterFD()
 	if !ok {
diff --git a/pkg/usermem/addr_range_seq_unsafe.go b/pkg/usermem/addr_range_seq_unsafe.go
index c09337c15..495896ded 100644
--- a/pkg/usermem/addr_range_seq_unsafe.go
+++ b/pkg/usermem/addr_range_seq_unsafe.go
@@ -81,8 +81,10 @@ func AddrRangeSeqFromSlice(slice []AddrRange) AddrRangeSeq {
 	return addrRangeSeqFromSliceLimited(slice, limit)
 }
 
-// Preconditions: The combined length of all AddrRanges in slice <= limit.
-// limit >= 0. If len(slice) != 0, then limit > 0.
+// Preconditions:
+// * The combined length of all AddrRanges in slice <= limit.
+// * limit >= 0.
+// * If len(slice) != 0, then limit > 0.
 func addrRangeSeqFromSliceLimited(slice []AddrRange, limit int64) AddrRangeSeq {
 	switch len(slice) {
 	case 0:
diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go
index cd6a0ea6b..9b1e7a085 100644
--- a/pkg/usermem/usermem.go
+++ b/pkg/usermem/usermem.go
@@ -21,7 +21,6 @@ import (
 	"io"
 	"strconv"
 
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/gohacks"
 	"gvisor.dev/gvisor/pkg/safemem"
@@ -54,8 +53,10 @@ type IO interface {
 	// of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a
 	// non-nil error explaining why.
 	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. toZero >= 0.
+	// Preconditions:
+	// * The caller must not hold mm.MemoryManager.mappingMu or any
+	//   following locks in the lock order.
+	// * toZero >= 0.
 	ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error)
 
 	// CopyOutFrom copies ars.NumBytes() bytes from src to the memory mapped at
@@ -66,9 +67,11 @@ type IO interface {
 	//
 	// CopyOutFrom calls src.ReadToBlocks at most once.
 	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. src.ReadToBlocks must not block
-	// on mm.MemoryManager.activeMu or any preceding locks in the lock order.
+	// Preconditions:
+	// * The caller must not hold mm.MemoryManager.mappingMu or any
+	//   following locks in the lock order.
+	// * src.ReadToBlocks must not block on mm.MemoryManager.activeMu or
+	//   any preceding locks in the lock order.
 	CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error)
 
 	// CopyInTo copies ars.NumBytes() bytes from the memory mapped at ars to
@@ -78,10 +81,11 @@ type IO interface {
 	//
 	// CopyInTo calls dst.WriteFromBlocks at most once.
 	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. dst.WriteFromBlocks must not
-	// block on mm.MemoryManager.activeMu or any preceding locks in the lock
-	// order.
+	// Preconditions:
+	// * The caller must not hold mm.MemoryManager.mappingMu or any
+	//   following locks in the lock order.
+	// * dst.WriteFromBlocks must not block on mm.MemoryManager.activeMu or
+	//   any preceding locks in the lock order.
 	CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error)
 
 	// TODO(jamieliu): The requirement that CopyOutFrom/CopyInTo call src/dst
@@ -93,25 +97,28 @@ type IO interface {
 	// SwapUint32 atomically sets the uint32 value at addr to new and
 	// returns the previous value.
 	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. addr must be aligned to a 4-byte
-	// boundary.
+	// Preconditions:
+	// * The caller must not hold mm.MemoryManager.mappingMu or any
+	//   following locks in the lock order.
+	// * addr must be aligned to a 4-byte boundary.
 	SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error)
 
 	// CompareAndSwapUint32 atomically compares the uint32 value at addr to
 	// old; if they are equal, the value in memory is replaced by new. In
 	// either case, the previous value stored in memory is returned.
 	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. addr must be aligned to a 4-byte
-	// boundary.
+	// Preconditions:
+	// * The caller must not hold mm.MemoryManager.mappingMu or any
+	//   following locks in the lock order.
+	// * addr must be aligned to a 4-byte boundary.
 	CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error)
 
 	// LoadUint32 atomically loads the uint32 value at addr and returns it.
 	//
-	// Preconditions: The caller must not hold mm.MemoryManager.mappingMu or
-	// any following locks in the lock order. addr must be aligned to a 4-byte
-	// boundary.
+	// Preconditions:
+	// * The caller must not hold mm.MemoryManager.mappingMu or any
+	//   following locks in the lock order.
+	// * addr must be aligned to a 4-byte boundary.
 	LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error)
 }
 
@@ -176,51 +183,6 @@ func (rw *IOReadWriter) Write(src []byte) (int, error) {
 	return n, err
 }
 
-// CopyObjectOut copies a fixed-size value or slice of fixed-size values from
-// src to the memory mapped at addr in uio. It returns the number of bytes
-// copied.
-//
-// CopyObjectOut must use reflection to encode src; performance-sensitive
-// clients should do encoding manually and use uio.CopyOut directly.
-//
-// Preconditions: As for IO.CopyOut.
-func CopyObjectOut(ctx context.Context, uio IO, addr Addr, src interface{}, opts IOOpts) (int, error) {
-	w := &IOReadWriter{
-		Ctx:  ctx,
-		IO:   uio,
-		Addr: addr,
-		Opts: opts,
-	}
-	// Allocate a byte slice the size of the object being marshaled. This
-	// adds an extra reflection call, but avoids needing to grow the slice
-	// during encoding, which can result in many heap-allocated slices.
-	b := make([]byte, 0, binary.Size(src))
-	return w.Write(binary.Marshal(b, ByteOrder, src))
-}
-
-// CopyObjectIn copies a fixed-size value or slice of fixed-size values from
-// the memory mapped at addr in uio to dst. It returns the number of bytes
-// copied.
-//
-// CopyObjectIn must use reflection to decode dst; performance-sensitive
-// clients should use uio.CopyIn directly and do decoding manually.
-//
-// Preconditions: As for IO.CopyIn.
-func CopyObjectIn(ctx context.Context, uio IO, addr Addr, dst interface{}, opts IOOpts) (int, error) {
-	r := &IOReadWriter{
-		Ctx:  ctx,
-		IO:   uio,
-		Addr: addr,
-		Opts: opts,
-	}
-	buf := make([]byte, binary.Size(dst))
-	if _, err := io.ReadFull(r, buf); err != nil {
-		return 0, err
-	}
-	binary.Unmarshal(buf, ByteOrder, dst)
-	return int(r.Addr - addr), nil
-}
-
 // CopyStringIn tuning parameters, defined outside that function for tests.
 const (
 	copyStringIncrement     = 64
@@ -233,7 +195,8 @@ const (
 // would exceed maxlen, CopyStringIn returns the string truncated to maxlen and
 // ENAMETOOLONG.
 //
-// Preconditions: As for IO.CopyFromUser. maxlen >= 0.
+// Preconditions: Same as IO.CopyFromUser, plus:
+// * maxlen >= 0.
 func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpts) (string, error) {
 	initLen := maxlen
 	if initLen > copyStringMaxInitBufLen {
@@ -287,7 +250,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt
 // less. CopyOutVec returns the number of bytes copied; if this is less than
 // the maximum, it returns a non-nil error explaining why.
 //
-// Preconditions: As for IO.CopyOut.
+// Preconditions: Same as IO.CopyOut.
 func CopyOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, src []byte, opts IOOpts) (int, error) {
 	var done int
 	for !ars.IsEmpty() && done < len(src) {
@@ -311,7 +274,7 @@ func CopyOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, src []byte, opts
 // less. CopyInVec returns the number of bytes copied; if this is less than the
 // maximum, it returns a non-nil error explaining why.
 //
-// Preconditions: As for IO.CopyIn.
+// Preconditions: Same as IO.CopyIn.
 func CopyInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst []byte, opts IOOpts) (int, error) {
 	var done int
 	for !ars.IsEmpty() && done < len(dst) {
@@ -335,7 +298,7 @@ func CopyInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst []byte, opts I
 // ZeroOutVec returns the number of bytes written; if this is less than the
 // maximum, it returns a non-nil error explaining why.
 //
-// Preconditions: As for IO.ZeroOut.
+// Preconditions: Same as IO.ZeroOut.
 func ZeroOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, toZero int64, opts IOOpts) (int64, error) {
 	var done int64
 	for !ars.IsEmpty() && done < toZero {
@@ -388,7 +351,7 @@ func isASCIIWhitespace(b byte) bool {
 //
 // - CopyInt32StringsInVec returns EINVAL if ars.NumBytes() == 0.
 //
-// Preconditions: As for CopyInVec.
+// Preconditions: Same as CopyInVec.
 func CopyInt32StringsInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dsts []int32, opts IOOpts) (int64, error) {
 	if len(dsts) == 0 {
 		return 0, nil
@@ -481,28 +444,28 @@ func (s IOSequence) NumBytes() int64 {
 
 // DropFirst returns a copy of s with s.Addrs.DropFirst(n).
 //
-// Preconditions: As for AddrRangeSeq.DropFirst.
+// Preconditions: Same as AddrRangeSeq.DropFirst.
 func (s IOSequence) DropFirst(n int) IOSequence {
 	return IOSequence{s.IO, s.Addrs.DropFirst(n), s.Opts}
 }
 
 // DropFirst64 returns a copy of s with s.Addrs.DropFirst64(n).
 //
-// Preconditions: As for AddrRangeSeq.DropFirst64.
+// Preconditions: Same as AddrRangeSeq.DropFirst64.
 func (s IOSequence) DropFirst64(n int64) IOSequence {
 	return IOSequence{s.IO, s.Addrs.DropFirst64(n), s.Opts}
 }
 
 // TakeFirst returns a copy of s with s.Addrs.TakeFirst(n).
 //
-// Preconditions: As for AddrRangeSeq.TakeFirst.
+// Preconditions: Same as AddrRangeSeq.TakeFirst.
 func (s IOSequence) TakeFirst(n int) IOSequence {
 	return IOSequence{s.IO, s.Addrs.TakeFirst(n), s.Opts}
 }
 
 // TakeFirst64 returns a copy of s with s.Addrs.TakeFirst64(n).
 //
-// Preconditions: As for AddrRangeSeq.TakeFirst64.
+// Preconditions: Same as AddrRangeSeq.TakeFirst64.
 func (s IOSequence) TakeFirst64(n int64) IOSequence {
 	return IOSequence{s.IO, s.Addrs.TakeFirst64(n), s.Opts}
 }
@@ -512,7 +475,7 @@ func (s IOSequence) TakeFirst64(n int64) IOSequence {
 // As with CopyOutVec, if s.NumBytes() < len(src), the copy will be truncated
 // to s.NumBytes(), and a nil error will be returned.
 //
-// Preconditions: As for CopyOutVec.
+// Preconditions: Same as CopyOutVec.
 func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) {
 	return CopyOutVec(ctx, s.IO, s.Addrs, src, s.Opts)
 }
@@ -522,7 +485,7 @@ func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) {
 // As with CopyInVec, if s.NumBytes() < len(dst), the copy will be truncated to
 // s.NumBytes(), and a nil error will be returned.
 //
-// Preconditions: As for CopyInVec.
+// Preconditions: Same as CopyInVec.
 func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) {
 	return CopyInVec(ctx, s.IO, s.Addrs, dst, s.Opts)
 }
@@ -532,21 +495,21 @@ func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) {
 // As with ZeroOutVec, if s.NumBytes() < toZero, the write will be truncated
 // to s.NumBytes(), and a nil error will be returned.
 //
-// Preconditions: As for ZeroOutVec.
+// Preconditions: Same as ZeroOutVec.
 func (s IOSequence) ZeroOut(ctx context.Context, toZero int64) (int64, error) {
 	return ZeroOutVec(ctx, s.IO, s.Addrs, toZero, s.Opts)
 }
 
 // CopyOutFrom invokes s.CopyOutFrom over s.Addrs.
 //
-// Preconditions: As for IO.CopyOutFrom.
+// Preconditions: Same as IO.CopyOutFrom.
 func (s IOSequence) CopyOutFrom(ctx context.Context, src safemem.Reader) (int64, error) {
 	return s.IO.CopyOutFrom(ctx, s.Addrs, src, s.Opts)
 }
 
 // CopyInTo invokes s.CopyInTo over s.Addrs.
 //
-// Preconditions: As for IO.CopyInTo.
+// Preconditions: Same as IO.CopyInTo.
 func (s IOSequence) CopyInTo(ctx context.Context, dst safemem.Writer) (int64, error) {
 	return s.IO.CopyInTo(ctx, s.Addrs, dst, s.Opts)
 }
diff --git a/pkg/usermem/usermem_test.go b/pkg/usermem/usermem_test.go
index bf3c5df2b..da60b0cc7 100644
--- a/pkg/usermem/usermem_test.go
+++ b/pkg/usermem/usermem_test.go
@@ -16,7 +16,6 @@ package usermem
 
 import (
 	"bytes"
-	"encoding/binary"
 	"fmt"
 	"reflect"
 	"strings"
@@ -174,23 +173,6 @@ type testStruct struct {
 	Uint64 uint64
 }
 
-func TestCopyObject(t *testing.T) {
-	wantObj := testStruct{1, 2, 3, 4, 5, 6, 7, 8}
-	wantN := binary.Size(wantObj)
-	b := &BytesIO{make([]byte, wantN)}
-	ctx := newContext()
-	if n, err := CopyObjectOut(ctx, b, 0, &wantObj, IOOpts{}); n != wantN || err != nil {
-		t.Fatalf("CopyObjectOut: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	var gotObj testStruct
-	if n, err := CopyObjectIn(ctx, b, 0, &gotObj, IOOpts{}); n != wantN || err != nil {
-		t.Errorf("CopyObjectIn: got (%v, %v), wanted (%v, nil)", n, err, wantN)
-	}
-	if gotObj != wantObj {
-		t.Errorf("CopyObject round trip: got %+v, wanted %+v", gotObj, wantObj)
-	}
-}
-
 func TestCopyStringInShort(t *testing.T) {
 	// Tests for string length <= copyStringIncrement.
 	want := strings.Repeat("A", copyStringIncrement-2)
diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go
index 67a950444..08519d986 100644
--- a/pkg/waiter/waiter.go
+++ b/pkg/waiter/waiter.go
@@ -168,7 +168,7 @@ func NewChannelEntry(c chan struct{}) (Entry, chan struct{}) {
 //
 // +stateify savable
 type Queue struct {
-	list waiterList   `state:"zerovalue"`
+	list waiterList
 	mu   sync.RWMutex `state:"nosave"`
 }
 
diff --git a/runsc/BUILD b/runsc/BUILD
index 96f697a5f..3b91b984a 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_binary", "pkg_deb", "pkg_tar")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
@@ -13,16 +13,7 @@ go_binary(
         "//visibility:public",
     ],
     x_defs = {"main.version": "{STABLE_VERSION}"},
-    deps = [
-        "//pkg/log",
-        "//pkg/refs",
-        "//pkg/sentry/platform",
-        "//runsc/boot",
-        "//runsc/cmd",
-        "//runsc/flag",
-        "//runsc/specutils",
-        "@com_github_google_subcommands//:go_default_library",
-    ],
+    deps = ["//runsc/cli"],
 )
 
 # The runsc-race target is a race-compatible BUILD target. This must be built
@@ -49,72 +40,7 @@ go_binary(
         "//visibility:public",
     ],
     x_defs = {"main.version": "{STABLE_VERSION}"},
-    deps = [
-        "//pkg/log",
-        "//pkg/refs",
-        "//pkg/sentry/platform",
-        "//runsc/boot",
-        "//runsc/cmd",
-        "//runsc/flag",
-        "//runsc/specutils",
-        "@com_github_google_subcommands//:go_default_library",
-    ],
-)
-
-pkg_tar(
-    name = "debian-bin",
-    srcs = [
-        ":runsc",
-        "//shim/v1:gvisor-containerd-shim",
-        "//shim/v2:containerd-shim-runsc-v1",
-    ],
-    mode = "0755",
-    package_dir = "/usr/bin",
-)
-
-pkg_tar(
-    name = "debian-data",
-    extension = "tar.gz",
-    deps = [
-        ":debian-bin",
-        "//shim:config",
-    ],
-)
-
-genrule(
-    name = "deb-version",
-    # Note that runsc must appear in the srcs parameter and not the tools
-    # parameter, otherwise it will not be stamped. This is reasonable, as tools
-    # may be encoded differently in the build graph (cached more aggressively
-    # because they are assumes to be hermetic).
-    srcs = [":runsc"],
-    outs = ["version.txt"],
-    # Note that the little dance here is necessary because files in the $(SRCS)
-    # attribute are not executable by default, and we can't touch in place.
-    cmd = "cp $(location :runsc) $(@D)/runsc && \
-        chmod a+x $(@D)/runsc && \
-        $(@D)/runsc -version | grep version | sed 's/^[^0-9]*//' > $@ && \
-        rm -f $(@D)/runsc",
-    stamp = 1,
-)
-
-pkg_deb(
-    name = "runsc-debian",
-    architecture = "amd64",
-    data = ":debian-data",
-    # Note that the description_file will be flatten (all newlines removed),
-    # and therefore it is kept to a simple one-line description. The expected
-    # format for debian packages is "short summary\nLonger explanation of
-    # tool." and this is impossible with the flattening.
-    description_file = "debian/description",
-    homepage = "https://gvisor.dev/",
-    maintainer = "The gVisor Authors <gvisor-dev@googlegroups.com>",
-    package = "runsc",
-    postinst = "debian/postinst.sh",
-    version_file = ":version.txt",
-    visibility = [
-        "//visibility:public",
-    ],
+    deps = ["//runsc/cli"],
 )
 
 sh_test(
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 9f52438c2..b97dc3c47 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -8,7 +8,6 @@ go_library(
         "compat.go",
         "compat_amd64.go",
         "compat_arm64.go",
-        "config.go",
         "controller.go",
         "debug.go",
         "events.go",
@@ -27,15 +26,19 @@ go_library(
     deps = [
         "//pkg/abi",
         "//pkg/abi/linux",
+        "//pkg/bpf",
+        "//pkg/cleanup",
         "//pkg/context",
         "//pkg/control/server",
         "//pkg/cpuid",
         "//pkg/eventchannel",
+        "//pkg/fd",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/memutil",
         "//pkg/rand",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/sentry/arch",
         "//pkg/sentry/arch:registers_go_proto",
         "//pkg/sentry/control",
@@ -105,9 +108,11 @@ go_library(
         "//runsc/boot/filter",
         "//runsc/boot/platforms",
         "//runsc/boot/pprof",
+        "//runsc/config",
         "//runsc/specutils",
-        "@com_github_golang_protobuf//proto:go_default_library",
+        "//runsc/specutils/seccomp",
         "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+        "@org_golang_google_protobuf//proto:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
@@ -123,6 +128,7 @@ go_test(
     library = ":boot",
     deps = [
         "//pkg/control/server",
+        "//pkg/fd",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
@@ -131,6 +137,7 @@ go_test(
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/unet",
+        "//runsc/config",
         "//runsc/fsgofer",
         "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 84c67cbc2..7076ae2e2 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -19,7 +19,7 @@ import (
 	"os"
 	"syscall"
 
-	"github.com/golang/protobuf/proto"
+	"google.golang.org/protobuf/proto"
 	"gvisor.dev/gvisor/pkg/eventchannel"
 	"gvisor.dev/gvisor/pkg/log"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 626a3816e..4e0f0d57a 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -22,6 +22,7 @@ import (
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -29,10 +30,12 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/sentry/state"
 	"gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -220,7 +223,7 @@ type StartArgs struct {
 	Spec *specs.Spec
 
 	// Config is the runsc-specific configuration for the sandbox.
-	Conf *Config
+	Conf *config.Config
 
 	// CID is the ID of the container to start.
 	CID string
@@ -256,13 +259,20 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
 	// All validation passed, logs the spec for debugging.
 	specutils.LogSpec(args.Spec)
 
-	err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+	fds, err := fd.NewFromFiles(args.FilePayload.Files)
 	if err != nil {
+		return err
+	}
+	defer func() {
+		for _, fd := range fds {
+			_ = fd.Close()
+		}
+	}()
+	if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil {
 		log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
 		return err
 	}
 	log.Debugf("Container %q started", args.CID)
-
 	return nil
 }
 
@@ -358,12 +368,20 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	cm.l.k = k
 
 	// Set up the restore environment.
+	ctx := k.SupervisorContext()
 	mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints)
-	renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
-	if err != nil {
-		return fmt.Errorf("creating RestoreEnvironment: %v", err)
+	if kernel.VFS2Enabled {
+		ctx, err = mntr.configureRestore(ctx, cm.l.root.conf)
+		if err != nil {
+			return fmt.Errorf("configuring filesystem restore: %v", err)
+		}
+	} else {
+		renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
+		if err != nil {
+			return fmt.Errorf("creating RestoreEnvironment: %v", err)
+		}
+		fs.SetRestoreEnvironment(*renv)
 	}
-	fs.SetRestoreEnvironment(*renv)
 
 	// Prepare to load from the state file.
 	if eps, ok := networkStack.(*netstack.Stack); ok {
@@ -390,7 +408,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 
 	// Load the state.
 	loadOpts := state.LoadOpts{Source: specFile}
-	if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil {
+	if err := loadOpts.Load(ctx, k, networkStack, time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil {
 		return err
 	}
 
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 149eb0b1b..a7c4ebb0c 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -27,41 +27,30 @@ import (
 // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
 var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_CLOCK_GETTIME: {},
-	syscall.SYS_CLONE: []seccomp.Rule{
-		{
-			seccomp.AllowValue(
-				syscall.CLONE_VM |
-					syscall.CLONE_FS |
-					syscall.CLONE_FILES |
-					syscall.CLONE_SIGHAND |
-					syscall.CLONE_SYSVSEM |
-					syscall.CLONE_THREAD),
-		},
-	},
-	syscall.SYS_CLOSE: {},
-	syscall.SYS_DUP:   {},
+	syscall.SYS_CLOSE:         {},
+	syscall.SYS_DUP:           {},
 	syscall.SYS_DUP3: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.O_CLOEXEC),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.O_CLOEXEC),
 		},
 	},
 	syscall.SYS_EPOLL_CREATE1: {},
 	syscall.SYS_EPOLL_CTL:     {},
 	syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_EVENTFD2: []seccomp.Rule{
 		{
-			seccomp.AllowValue(0),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(0),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_EXIT:       {},
@@ -70,16 +59,16 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_FCHMOD:     {},
 	syscall.SYS_FCNTL: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_GETFL),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_GETFL),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_SETFL),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_SETFL),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_GETFD),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_GETFD),
 		},
 	},
 	syscall.SYS_FSTAT:     {},
@@ -87,52 +76,52 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_FTRUNCATE: {},
 	syscall.SYS_FUTEX: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.MatchAny{},
 		},
 		// Non-private variants are included for flipcall support. They are otherwise
 		// unncessary, as the sentry will use only private futexes internally.
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAIT),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAIT),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAKE),
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAKE),
+			seccomp.MatchAny{},
 		},
 	},
 	syscall.SYS_GETPID: {},
 	unix.SYS_GETRANDOM: {},
 	syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_DOMAIN),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_DOMAIN),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_TYPE),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_TYPE),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_ERROR),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_ERROR),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_SNDBUF),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_SNDBUF),
 		},
 	},
 	syscall.SYS_GETTID:       {},
@@ -141,38 +130,44 @@ var allowedSyscalls = seccomp.SyscallRules{
 	// setting/getting termios and winsize.
 	syscall.SYS_IOCTL: []seccomp.Rule{
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCGETS),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCGETS),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCSETS),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCSETS),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCSETSF),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCSETSF),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCSETSW),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCSETSW),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TIOCSWINSZ),
-			seccomp.AllowAny{}, /* winsize struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TIOCSWINSZ),
+			seccomp.MatchAny{}, /* winsize struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TIOCGWINSZ),
-			seccomp.AllowAny{}, /* winsize struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TIOCGWINSZ),
+			seccomp.MatchAny{}, /* winsize struct */
 		},
 	},
 	syscall.SYS_LSEEK:   {},
 	syscall.SYS_MADVISE: {},
+	unix.SYS_MEMBARRIER: []seccomp.Rule{
+		{
+			seccomp.EqualTo(linux.MEMBARRIER_CMD_GLOBAL),
+			seccomp.EqualTo(0),
+		},
+	},
 	syscall.SYS_MINCORE: {},
 	// Used by the Go runtime as a temporarily workaround for a Linux
 	// 5.2-5.4 bug.
@@ -182,46 +177,46 @@ var allowedSyscalls = seccomp.SyscallRules{
 	// TODO(b/148688965): Remove once this is gone from Go.
 	syscall.SYS_MLOCK: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(4096),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(4096),
 		},
 	},
 	syscall.SYS_MMAP: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_SHARED),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_SHARED),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ),
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.PROT_WRITE | syscall.PROT_READ),
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
 		},
 	},
 	syscall.SYS_MPROTECT:  {},
@@ -237,32 +232,32 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_READ:      {},
 	syscall.SYS_RECVMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
 		},
 	},
 	syscall.SYS_RECVMMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(fdbased.MaxMsgsPerRecv),
-			seccomp.AllowValue(syscall.MSG_DONTWAIT),
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(fdbased.MaxMsgsPerRecv),
+			seccomp.EqualTo(syscall.MSG_DONTWAIT),
+			seccomp.EqualTo(0),
 		},
 	},
 	unix.SYS_SENDMMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT),
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_RESTART_SYSCALL: {},
@@ -272,49 +267,49 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_SCHED_YIELD:     {},
 	syscall.SYS_SENDMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
 		},
 	},
 	syscall.SYS_SETITIMER: {},
 	syscall.SYS_SHUTDOWN: []seccomp.Rule{
 		// Used by fs/host to shutdown host sockets.
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)},
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RD)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_WR)},
 		// Used by unet to shutdown connections.
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)},
 	},
 	syscall.SYS_SIGALTSTACK:     {},
 	unix.SYS_STATX:              {},
 	syscall.SYS_SYNC_FILE_RANGE: {},
 	syscall.SYS_TEE: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(1),                      /* len */
-			seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(1),                      /* len */
+			seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */
 		},
 	},
 	syscall.SYS_TGKILL: []seccomp.Rule{
 		{
-			seccomp.AllowValue(uint64(os.Getpid())),
+			seccomp.EqualTo(uint64(os.Getpid())),
 		},
 	},
 	syscall.SYS_UTIMENSAT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0), /* null pathname */
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0), /* flags */
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0), /* null pathname */
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0), /* flags */
 		},
 	},
 	syscall.SYS_WRITE: {},
 	// For rawfile.NonBlockingWriteIovec.
 	syscall.SYS_WRITEV: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 			seccomp.GreaterThan(0),
 		},
 	},
@@ -325,10 +320,10 @@ func hostInetFilters() seccomp.SyscallRules {
 	return seccomp.SyscallRules{
 		syscall.SYS_ACCEPT4: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.MatchAny{},
+				seccomp.MatchAny{},
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
 			},
 		},
 		syscall.SYS_BIND:        {},
@@ -337,84 +332,84 @@ func hostInetFilters() seccomp.SyscallRules {
 		syscall.SYS_GETSOCKNAME: {},
 		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_TOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_TOS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_RECVTOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_RECVTOS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_TCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_TCLASS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_V6ONLY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_V6ONLY),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_ERROR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_ERROR),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_KEEPALIVE),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_KEEPALIVE),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_SNDBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_SNDBUF),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_RCVBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_RCVBUF),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_REUSEADDR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_REUSEADDR),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_TYPE),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_TYPE),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_LINGER),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_LINGER),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_TCP),
-				seccomp.AllowValue(syscall.TCP_NODELAY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_TCP),
+				seccomp.EqualTo(syscall.TCP_NODELAY),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_TCP),
-				seccomp.AllowValue(syscall.TCP_INFO),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_TCP),
+				seccomp.EqualTo(syscall.TCP_INFO),
 			},
 		},
 		syscall.SYS_IOCTL: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.TIOCOUTQ),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.TIOCOUTQ),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.TIOCINQ),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.TIOCINQ),
 			},
 		},
 		syscall.SYS_LISTEN:   {},
@@ -425,103 +420,103 @@ func hostInetFilters() seccomp.SyscallRules {
 		syscall.SYS_SENDTO:   {},
 		syscall.SYS_SETSOCKOPT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_V6ONLY),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_V6ONLY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_SNDBUF),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_SNDBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_RCVBUF),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_RCVBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_REUSEADDR),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_REUSEADDR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_TCP),
-				seccomp.AllowValue(syscall.TCP_NODELAY),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_TCP),
+				seccomp.EqualTo(syscall.TCP_NODELAY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_TOS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_TOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_RECVTOS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_RECVTOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_TCLASS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_TCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 		},
 		syscall.SYS_SHUTDOWN: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SHUT_RD),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SHUT_RD),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SHUT_WR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SHUT_WR),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SHUT_RDWR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SHUT_RDWR),
 			},
 		},
 		syscall.SYS_SOCKET: []seccomp.Rule{
 			{
-				seccomp.AllowValue(syscall.AF_INET),
-				seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET),
+				seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 			{
-				seccomp.AllowValue(syscall.AF_INET),
-				seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET),
+				seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 			{
-				seccomp.AllowValue(syscall.AF_INET6),
-				seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET6),
+				seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 			{
-				seccomp.AllowValue(syscall.AF_INET6),
-				seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET6),
+				seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 		},
 		syscall.SYS_WRITEV: {},
@@ -532,20 +527,20 @@ func controlServerFilters(fd int) seccomp.SyscallRules {
 	return seccomp.SyscallRules{
 		syscall.SYS_ACCEPT: []seccomp.Rule{
 			{
-				seccomp.AllowValue(fd),
+				seccomp.EqualTo(fd),
 			},
 		},
 		syscall.SYS_LISTEN: []seccomp.Rule{
 			{
-				seccomp.AllowValue(fd),
-				seccomp.AllowValue(16 /* unet.backlog */),
+				seccomp.EqualTo(fd),
+				seccomp.EqualTo(16 /* unet.backlog */),
 			},
 		},
 		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_PEERCRED),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_PEERCRED),
 			},
 		},
 	}
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
index 5335ff82c..cea5613b8 100644
--- a/runsc/boot/filter/config_amd64.go
+++ b/runsc/boot/filter/config_amd64.go
@@ -24,8 +24,41 @@ import (
 )
 
 func init() {
-	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL],
-		seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)},
-		seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)},
-	)
+	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
+		// TODO(b/168828518): No longer used in Go 1.16+.
+		{seccomp.EqualTo(linux.ARCH_SET_FS)},
+	}
+
+	allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+		// parent_tidptr and child_tidptr are always 0 because neither
+		// CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used.
+		{
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SETTLS |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			seccomp.EqualTo(0), // parent_tidptr
+			seccomp.EqualTo(0), // child_tidptr
+			seccomp.MatchAny{}, // tls
+		},
+		{
+			// TODO(b/168828518): No longer used in Go 1.16+ (on amd64).
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			seccomp.EqualTo(0), // parent_tidptr
+			seccomp.EqualTo(0), // child_tidptr
+			seccomp.MatchAny{}, // tls
+		},
+	}
 }
diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go
index 7fa9bbda3..37313f97f 100644
--- a/runsc/boot/filter/config_arm64.go
+++ b/runsc/boot/filter/config_arm64.go
@@ -16,6 +16,29 @@
 
 package filter
 
-// Reserve for future customization.
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
 func init() {
+	allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+		{
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			// These arguments are left uninitialized by the Go
+			// runtime, so they may be anything (and are unused by
+			// the host).
+			seccomp.MatchAny{}, // parent_tidptr
+			seccomp.MatchAny{}, // tls
+			seccomp.MatchAny{}, // child_tidptr
+		},
+	}
 }
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
index 194952a7b..7b8669595 100644
--- a/runsc/boot/filter/config_profile.go
+++ b/runsc/boot/filter/config_profile.go
@@ -25,9 +25,9 @@ func profileFilters() seccomp.SyscallRules {
 	return seccomp.SyscallRules{
 		syscall.SYS_OPENAT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+				seccomp.MatchAny{},
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
 			},
 		},
 	}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index a30fa198e..6b6ae98d7 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -34,6 +34,7 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
@@ -48,6 +49,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -66,7 +68,7 @@ const (
 // tmpfs has some extra supported options that we must pass through.
 var tmpfsAllowedData = []string{"mode", "uid", "gid"}
 
-func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
 	// Upper layer uses the same flags as lower, but it must be read-write.
 	upperFlags := lowerFlags
 	upperFlags.ReadOnly = false
@@ -163,7 +165,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 }
 
 // p9MountData creates a slice of p9 mount data.
-func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
+func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string {
 	opts := []string{
 		"trans=fd",
 		"rfdno=" + strconv.Itoa(fd),
@@ -174,7 +176,7 @@ func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
 		// enablement.
 		opts = append(opts, "privateunixsocket=true")
 	}
-	if fa == FileAccessShared {
+	if fa == config.FileAccessShared {
 		opts = append(opts, "cache=remote_revalidating")
 	}
 	return opts
@@ -259,7 +261,7 @@ func mustFindFilesystem(name string) fs.Filesystem {
 
 // addSubmountOverlay overlays the inode over a ramfs tree containing the given
 // paths.
-func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) {
 	// Construct a ramfs tree of mount points. The contents never
 	// change, so this can be fully caching. There's no real
 	// filesystem backing this tree, so we set the filesystem to
@@ -269,7 +271,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string
 	if err != nil {
 		return nil, fmt.Errorf("creating mount tree: %v", err)
 	}
-	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
+	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf)
 	if err != nil {
 		return nil, fmt.Errorf("adding mount overlay: %v", err)
 	}
@@ -288,7 +290,7 @@ func subtargets(root string, mnts []specs.Mount) []string {
 	return targets
 }
 
-func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+func setupContainerFS(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
 	if conf.VFS2 {
 		return setupContainerVFS2(ctx, conf, mntr, procArgs)
 	}
@@ -326,14 +328,14 @@ func adjustDirentCache(k *kernel.Kernel) error {
 }
 
 type fdDispenser struct {
-	fds []int
+	fds []*fd.FD
 }
 
 func (f *fdDispenser) remove() int {
 	if f.empty() {
 		panic("fdDispenser out of fds")
 	}
-	rv := f.fds[0]
+	rv := f.fds[0].Release()
 	f.fds = f.fds[1:]
 	return rv
 }
@@ -459,27 +461,27 @@ func (m *mountHint) isSupported() bool {
 func (m *mountHint) checkCompatible(mount specs.Mount) error {
 	// Remove options that don't affect to mount's behavior.
 	masterOpts := filterUnsupportedOptions(m.mount)
-	slaveOpts := filterUnsupportedOptions(mount)
+	replicaOpts := filterUnsupportedOptions(mount)
 
-	if len(masterOpts) != len(slaveOpts) {
-		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+	if len(masterOpts) != len(replicaOpts) {
+		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
 	}
 
 	sort.Strings(masterOpts)
-	sort.Strings(slaveOpts)
+	sort.Strings(replicaOpts)
 	for i, opt := range masterOpts {
-		if opt != slaveOpts[i] {
-			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+		if opt != replicaOpts[i] {
+			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
 		}
 	}
 	return nil
 }
 
-func (m *mountHint) fileAccessType() FileAccessType {
+func (m *mountHint) fileAccessType() config.FileAccessType {
 	if m.share == container {
-		return FileAccessExclusive
+		return config.FileAccessExclusive
 	}
-	return FileAccessShared
+	return config.FileAccessShared
 }
 
 func filterUnsupportedOptions(mount specs.Mount) []string {
@@ -570,7 +572,7 @@ type containerMounter struct {
 	hints *podMountHints
 }
 
-func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints) *containerMounter {
 	return &containerMounter{
 		root:   spec.Root,
 		mounts: compileMounts(spec),
@@ -583,7 +585,7 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 // processHints processes annotations that container hints about how volumes
 // should be mounted (e.g. a volume shared between containers). It must be
 // called for the root container only.
-func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error {
+func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error {
 	if conf.VFS2 {
 		return c.processHintsVFS2(conf, creds)
 	}
@@ -607,7 +609,7 @@ func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) e
 // setupFS is used to set up the file system for all containers. This is the
 // main entry point method, with most of the other being internal only. It
 // returns the mount namespace that is created for the container.
-func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
+func (c *containerMounter) setupFS(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
 	log.Infof("Configuring container's file system")
 
 	// Create context with root credentials to mount the filesystem (the current
@@ -633,7 +635,7 @@ func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessA
 	return mns, nil
 }
 
-func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
+func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config) (*fs.MountNamespace, error) {
 	rootInode, err := c.createRootMount(ctx, conf)
 	if err != nil {
 		return nil, fmt.Errorf("creating filesystem for container: %v", err)
@@ -645,7 +647,7 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi
 	return mns, nil
 }
 
-func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *fs.MountNamespace) error {
 	root := mns.Root()
 	defer root.DecRef(ctx)
 
@@ -681,7 +683,7 @@ func (c *containerMounter) checkDispenser() error {
 
 // mountSharedMaster mounts the master of a volume that is shared among
 // containers in a pod. It returns the root mount's inode.
-func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) {
 	// Map mount type to filesystem name, and parse out the options that we are
 	// capable of dealing with.
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
@@ -721,7 +723,7 @@ func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config,
 }
 
 // createRootMount creates the root filesystem.
-func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
+func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Config) (*fs.Inode, error) {
 	// First construct the filesystem from the spec.Root.
 	mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
 
@@ -746,7 +748,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
 	// mounted even if they are not in the spec.
 	submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
-	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf)
 	if err != nil {
 		return nil, fmt.Errorf("adding submount overlay: %v", err)
 	}
@@ -766,7 +768,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 
 // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
 // used for mounts.
-func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
+func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.Mount) (string, []string, bool, error) {
 	var (
 		fsName     string
 		opts       []string
@@ -800,19 +802,19 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	return fsName, opts, useOverlay, nil
 }
 
-func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
+func (c *containerMounter) getMountAccessType(mount specs.Mount) config.FileAccessType {
 	if hint := c.hints.findMount(mount); hint != nil {
 		return hint.fileAccessType()
 	}
 	// Non-root bind mounts are always shared if no hints were provided.
-	return FileAccessShared
+	return config.FileAccessShared
 }
 
 // mountSubmount mounts volumes inside the container's root. Because mounts may
 // be readonly, a lower ramfs overlay is added to create the mount point dir.
 // Another overlay is added with tmpfs on top if Config.Overlay is true.
 // 'm.Destination' must be an absolute path with '..' and symlinks resolved.
-func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
+func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
 	// Map mount type to filesystem name, and parse out the options that we are
 	// capable of dealing with.
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
@@ -856,7 +858,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 	submounts := subtargets(m.Destination, c.mounts)
 	if len(submounts) > 0 {
 		log.Infof("Adding submount overlay over %q", m.Destination)
-		inode, err = addSubmountOverlay(ctx, inode, submounts)
+		inode, err = addSubmountOverlay(ctx, inode, submounts, mf)
 		if err != nil {
 			return fmt.Errorf("adding submount overlay: %v", err)
 		}
@@ -911,7 +913,7 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
 
 // addRestoreMount adds a mount to the MountSources map used for restoring a
 // checkpointed container.
-func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
+func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
 	if err != nil {
 		return err
@@ -936,7 +938,7 @@ func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnviron
 
 // createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
 // the mounts to the environment.
-func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
+func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.RestoreEnvironment, error) {
 	renv := &fs.RestoreEnvironment{
 		MountSources: make(map[string][]fs.MountArgs),
 	}
@@ -991,7 +993,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
 //
 // Note that when there are submounts inside of '/tmp', directories for the
 // mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent) error {
 	for _, m := range c.mounts {
 		if filepath.Clean(m.Destination) == "/tmp" {
 			log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 912037075..e986231e5 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -20,6 +20,7 @@ import (
 	"testing"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 func TestPodMountHintsHappy(t *testing.T) {
@@ -196,7 +197,7 @@ func TestGetMountAccessType(t *testing.T) {
 	for _, tst := range []struct {
 		name        string
 		annotations map[string]string
-		want        FileAccessType
+		want        config.FileAccessType
 	}{
 		{
 			name: "container=exclusive",
@@ -205,7 +206,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "container",
 			},
-			want: FileAccessExclusive,
+			want: config.FileAccessExclusive,
 		},
 		{
 			name: "pod=shared",
@@ -214,7 +215,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "pod",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 		{
 			name: "shared=shared",
@@ -223,7 +224,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "shared",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 		{
 			name: "default=shared",
@@ -232,7 +233,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "container",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 	} {
 		t.Run(tst.name, func(t *testing.T) {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 40c6f99fd..8c6ab213d 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -27,12 +27,15 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bpf"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/memutil"
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fdimport"
@@ -67,7 +70,9 @@ import (
 	"gvisor.dev/gvisor/runsc/boot/filter"
 	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
 	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
+	"gvisor.dev/gvisor/runsc/specutils/seccomp"
 
 	// Include supported socket providers.
 	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
@@ -79,7 +84,7 @@ import (
 )
 
 type containerInfo struct {
-	conf *Config
+	conf *config.Config
 
 	// spec is the base configuration for the root container.
 	spec *specs.Spec
@@ -88,10 +93,10 @@ type containerInfo struct {
 	procArgs kernel.CreateProcessArgs
 
 	// stdioFDs contains stdin, stdout, and stderr.
-	stdioFDs []int
+	stdioFDs []*fd.FD
 
 	// goferFDs are the FDs that attach the sandbox to the gofers.
-	goferFDs []int
+	goferFDs []*fd.FD
 }
 
 // Loader keeps state needed to start the kernel and run the container..
@@ -165,7 +170,7 @@ type Args struct {
 	// Spec is the sandbox specification.
 	Spec *specs.Spec
 	// Conf is the system configuration.
-	Conf *Config
+	Conf *config.Config
 	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
 	// of this FD and may close it at any time.
 	ControllerFD int
@@ -278,6 +283,7 @@ func New(args Args) (*Loader, error) {
 		args.NumCPU = runtime.NumCPU()
 	}
 	log.Infof("CPUs: %d", args.NumCPU)
+	runtime.GOMAXPROCS(args.NumCPU)
 
 	if args.TotalMem > 0 {
 		// Adjust the total memory returned by the Sentry so that applications that
@@ -355,12 +361,17 @@ func New(args Args) (*Loader, error) {
 		k.SetHostMount(hostMount)
 	}
 
+	info := containerInfo{
+		conf:     args.Conf,
+		spec:     args.Spec,
+		procArgs: procArgs,
+	}
+
 	// Make host FDs stable between invocations. Host FDs must map to the exact
 	// same number when the sandbox is restored. Otherwise the wrong FD will be
 	// used.
-	var stdioFDs []int
 	newfd := startingStdioFD
-	for _, fd := range args.StdioFDs {
+	for _, stdioFD := range args.StdioFDs {
 		// Check that newfd is unused to avoid clobbering over it.
 		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
 			if err != nil {
@@ -369,14 +380,17 @@ func New(args Args) (*Loader, error) {
 			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
 		}
 
-		err := unix.Dup3(fd, newfd, unix.O_CLOEXEC)
+		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
 		if err != nil {
-			return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
 		}
-		stdioFDs = append(stdioFDs, newfd)
-		_ = unix.Close(fd)
+		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
+		_ = unix.Close(stdioFD)
 		newfd++
 	}
+	for _, goferFD := range args.GoferFDs {
+		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
+	}
 
 	eid := execID{cid: args.ID}
 	l := &Loader{
@@ -385,13 +399,7 @@ func New(args Args) (*Loader, error) {
 		sandboxID:  args.ID,
 		processes:  map[execID]*execProcess{eid: {}},
 		mountHints: mountHints,
-		root: containerInfo{
-			conf:     args.Conf,
-			stdioFDs: stdioFDs,
-			goferFDs: args.GoferFDs,
-			spec:     args.Spec,
-			procArgs: procArgs,
-		},
+		root:       info,
 	}
 
 	// We don't care about child signals; some platforms can generate a
@@ -465,13 +473,28 @@ func (l *Loader) Destroy() {
 	}
 	l.watchdog.Stop()
 
-	for i, fd := range l.root.stdioFDs {
-		_ = unix.Close(fd)
-		l.root.stdioFDs[i] = -1
+	// Release all kernel resources. This is only safe after we can no longer
+	// save/restore.
+	l.k.Release()
+
+	// All sentry-created resources should have been released at this point;
+	// check for reference leaks.
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.DoLeakCheck()
+	}
+
+	// In the success case, stdioFDs and goferFDs will only contain
+	// released/closed FDs that ownership has been passed over to host FDs and
+	// gofer sessions. Close them here in case of failure.
+	for _, fd := range l.root.stdioFDs {
+		_ = fd.Close()
+	}
+	for _, fd := range l.root.goferFDs {
+		_ = fd.Close()
 	}
 }
 
-func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
+func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
 	p, err := platform.Lookup(conf.Platform)
 	if err != nil {
 		panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err))
@@ -498,13 +521,14 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
 	return mf, nil
 }
 
+// installSeccompFilters installs sandbox seccomp filters with the host.
 func (l *Loader) installSeccompFilters() error {
 	if l.root.conf.DisableSeccomp {
 		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
 	} else {
 		opts := filter.Options{
 			Platform:      l.k.Platform,
-			HostNetwork:   l.root.conf.Network == NetworkHost,
+			HostNetwork:   l.root.conf.Network == config.NetworkHost,
 			ProfileEnable: l.root.conf.ProfileEnable,
 			ControllerFD:  l.ctrl.srv.FD(),
 		}
@@ -531,7 +555,7 @@ func (l *Loader) Run() error {
 }
 
 func (l *Loader) run() error {
-	if l.root.conf.Network == NetworkHost {
+	if l.root.conf.Network == config.NetworkHost {
 		// Delay host network configuration to this point because network namespace
 		// is configured after the loader is created and before Run() is called.
 		log.Debugf("Configuring host network")
@@ -568,6 +592,7 @@ func (l *Loader) run() error {
 		if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil {
 			return err
 		}
+
 	}
 
 	ep.tg = l.k.GlobalInit()
@@ -597,17 +622,6 @@ func (l *Loader) run() error {
 		}
 	})
 
-	// l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again
-	// either in createFDTable() during initial start or in descriptor.initAfterLoad()
-	// during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
-	// passed FDs, so only close for VFS1.
-	if !kernel.VFS2Enabled {
-		for i, fd := range l.root.stdioFDs {
-			_ = unix.Close(fd)
-			l.root.stdioFDs[i] = -1
-		}
-	}
-
 	log.Infof("Process should have started...")
 	l.watchdog.Start()
 	return l.k.Start()
@@ -627,9 +641,9 @@ func (l *Loader) createContainer(cid string) error {
 }
 
 // startContainer starts a child container. It returns the thread group ID of
-// the newly created process. Caller owns 'files' and may close them after
-// this method returns.
-func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+// the newly created process. Used FDs are either closed or released. It's safe
+// for the caller to close any remaining files upon return.
+func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error {
 	// Create capabilities.
 	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
 	if err != nil {
@@ -680,28 +694,15 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 	}
 
 	info := &containerInfo{
-		conf: conf,
-		spec: spec,
+		conf:     conf,
+		spec:     spec,
+		stdioFDs: files[:3],
+		goferFDs: files[3:],
 	}
 	info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
 	if err != nil {
 		return fmt.Errorf("creating new process: %v", err)
 	}
-
-	// setupContainerFS() dups stdioFDs, so we don't need to dup them here.
-	for _, f := range files[:3] {
-		info.stdioFDs = append(info.stdioFDs, int(f.Fd()))
-	}
-
-	// Can't take ownership away from os.File. dup them to get a new FDs.
-	for _, f := range files[3:] {
-		fd, err := unix.Dup(int(f.Fd()))
-		if err != nil {
-			return fmt.Errorf("failed to dup file: %v", err)
-		}
-		info.goferFDs = append(info.goferFDs, fd)
-	}
-
 	tg, err := l.createContainerProcess(false, cid, info, ep)
 	if err != nil {
 		return err
@@ -743,7 +744,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
 		return nil, err
 	}
 
-	// Add the HOME enviroment variable if it is not already set.
+	// Add the HOME environment variable if it is not already set.
 	var envv []string
 	if kernel.VFS2Enabled {
 		envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2,
@@ -779,19 +780,44 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
 		}
 	}
 
+	// Install seccomp filters with the new task if there are any.
+	if info.conf.OCISeccomp {
+		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+			program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
+			if err != nil {
+				return nil, fmt.Errorf("building seccomp program: %v", err)
+			}
+
+			if log.IsLogging(log.Debug) {
+				out, _ := bpf.DecodeProgram(program)
+				log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
+			}
+
+			task := tg.Leader()
+			// NOTE: It seems Flags are ignored by runc so we ignore them too.
+			if err := task.AppendSyscallFilter(program, true); err != nil {
+				return nil, fmt.Errorf("appending seccomp filters: %v", err)
+			}
+		}
+	} else {
+		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+			log.Warningf("Seccomp spec is being ignored")
+		}
+	}
+
 	return tg, nil
 }
 
 // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
-// the gofer FDs looking for disconnects, and destroys the container if a
+// the gofer FDs looking for disconnects, and kills the container processes if a
 // disconnect occurs in any of the gofer FDs.
-func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
+func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
 	go func() {
 		log.Debugf("Monitoring gofer health for container %q", cid)
 		var events []unix.PollFd
-		for _, fd := range goferFDs {
+		for _, goferFD := range goferFDs {
 			events = append(events, unix.PollFd{
-				Fd:     int32(fd),
+				Fd:     int32(goferFD.FD()),
 				Events: unix.POLLHUP | unix.POLLRDHUP,
 			})
 		}
@@ -804,18 +830,15 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
 			panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
 		}
 
-		// Check if the gofer has stopped as part of normal container destruction.
-		// This is done just to avoid sending an annoying error message to the log.
-		// Note that there is a small race window in between mu.Unlock() and the
-		// lock being reacquired in destroyContainer(), but it's harmless to call
-		// destroyContainer() multiple times.
 		l.mu.Lock()
-		_, ok := l.processes[execID{cid: cid}]
-		l.mu.Unlock()
-		if ok {
-			log.Infof("Gofer socket disconnected, destroying container %q", cid)
-			if err := l.destroyContainer(cid); err != nil {
-				log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+		defer l.mu.Unlock()
+
+		// The gofer could have been stopped due to a normal container shutdown.
+		// Check if the container has not stopped yet.
+		if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
+			log.Infof("Gofer socket disconnected, killing container %q", cid)
+			if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
+				log.Warningf("Error killing container %q after gofer stopped: %v", cid, err)
 			}
 		}
 	}()
@@ -884,17 +907,24 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		return 0, fmt.Errorf("container %q not started", args.ContainerID)
 	}
 
-	// Get the container MountNamespace from the Task.
+	// Get the container MountNamespace from the Task. Try to acquire ref may fail
+	// in case it raced with task exit.
 	if kernel.VFS2Enabled {
-		// task.MountNamespace() does not take a ref, so we must do so ourselves.
+		// task.MountNamespaceVFS2() does not take a ref, so we must do so ourselves.
 		args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
-		args.MountNamespaceVFS2.IncRef()
+		if !args.MountNamespaceVFS2.TryIncRef() {
+			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+		}
 	} else {
+		var reffed bool
 		tg.Leader().WithMuLocked(func(t *kernel.Task) {
 			// task.MountNamespace() does not take a ref, so we must do so ourselves.
 			args.MountNamespace = t.MountNamespace()
-			args.MountNamespace.IncRef()
+			reffed = args.MountNamespace.TryIncRef()
 		})
+		if !reffed {
+			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+		}
 	}
 
 	// Add the HOME environment variable if it is not already set.
@@ -902,7 +932,6 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		root := args.MountNamespaceVFS2.Root()
 		ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
 		defer args.MountNamespaceVFS2.DecRef(ctx)
-		defer root.DecRef(ctx)
 		envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
 		if err != nil {
 			return 0, err
@@ -1017,17 +1046,17 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
 	// Create an empty network stack because the network namespace may be empty at
 	// this point. Netns is configured before Run() is called. Netstack is
 	// configured using a control uRPC message. Host network is configured inside
 	// Run().
 	switch conf.Network {
-	case NetworkHost:
+	case config.NetworkHost:
 		// No network namespacing support for hostinet yet, hence creator is nil.
 		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
 
-	case NetworkNone, NetworkSandbox:
+	case config.NetworkNone, config.NetworkSandbox:
 		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
 		if err != nil {
 			return nil, err
@@ -1045,8 +1074,8 @@ func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.Uni
 }
 
 func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
-	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
-	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+	netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
+	transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4}
 	s := netstack.Stack{stack.New(stack.Options{
 		NetworkProtocols:   netProtos,
 		TransportProtocols: transProtos,
@@ -1060,17 +1089,30 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
 	})}
 
 	// Enable SACK Recovery.
-	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
-		return nil, fmt.Errorf("failed to enable SACK: %s", err)
+	{
+		opt := tcpip.TCPSACKEnabled(true)
+		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	// Set default TTLs as required by socket/netstack.
-	s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
-	s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	{
+		opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
+		if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
+		}
+		if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
+		}
+	}
 
 	// Enable Receive Buffer Auto-Tuning.
-	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-		return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPModerateReceiveBufferOption(true)
+		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	return &s, nil
@@ -1266,7 +1308,7 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2
 	return ep.tty, ep.ttyVFS2, nil
 }
 
-func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
 	if len(stdioFDs) != 3 {
 		return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
 	}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index aa3fdf96c..b77b4762e 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -26,6 +26,7 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
@@ -34,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/fsgofer"
 )
 
@@ -43,15 +45,19 @@ func init() {
 	if err := fsgofer.OpenProcSelfFD(); err != nil {
 		panic(err)
 	}
+	config.RegisterFlags()
 }
 
-func testConfig() *Config {
-	return &Config{
-		RootDir:        "unused_root_dir",
-		Network:        NetworkNone,
-		DisableSeccomp: true,
-		Platform:       "ptrace",
+func testConfig() *config.Config {
+	conf, err := config.NewFromFlags()
+	if err != nil {
+		panic(err)
 	}
+	// Change test defaults.
+	conf.RootDir = "unused_root_dir"
+	conf.Network = config.NetworkNone
+	conf.DisableSeccomp = true
+	return conf
 }
 
 // testSpec returns a simple spec that can be used in tests.
@@ -258,9 +264,9 @@ type CreateMountTestcase struct {
 	expectedPaths []string
 }
 
-func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+func createMountTestcases() []*CreateMountTestcase {
 	testCases := []*CreateMountTestcase{
-		&CreateMountTestcase{
+		{
 			// Only proc.
 			name: "only proc mount",
 			spec: specs.Spec{
@@ -298,11 +304,10 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
 					},
 				},
 			},
-			// /some/deep/path should be mounted, along with /proc,
-			// /dev, and /sys.
+			// /some/deep/path should be mounted, along with /proc, /dev, and /sys.
 			expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"},
 		},
-		&CreateMountTestcase{
+		{
 			// Mounts are nested inside each other.
 			name: "nested mounts",
 			spec: specs.Spec{
@@ -346,7 +351,7 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
 			expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux",
 				"/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"},
 		},
-		&CreateMountTestcase{
+		{
 			name: "mount inside /dev",
 			spec: specs.Spec{
 				Root: &specs.Root{
@@ -389,46 +394,42 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
 			},
 			expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"},
 		},
-	}
-
-	vfsCase := &CreateMountTestcase{
-		name: "mounts inside mandatory mounts",
-		spec: specs.Spec{
-			Root: &specs.Root{
-				Path:     os.TempDir(),
-				Readonly: true,
-			},
-			Mounts: []specs.Mount{
-				{
-					Destination: "/proc",
-					Type:        "tmpfs",
+		{
+			name: "mounts inside mandatory mounts",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
 				},
-				// TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
-				//  MkDirAt in VFS2 (and remove the reduntant append).
-				// {
-				//		Destination: "/sys/bar",
-				//		Type:        "tmpfs",
-				//	},
-				//
-				{
-					Destination: "/tmp/baz",
-					Type:        "tmpfs",
+				Mounts: []specs.Mount{
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/sys/bar",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/tmp/baz",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/dev/goo",
+						Type:        "tmpfs",
+					},
 				},
 			},
+			expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz", "/dev/goo"},
 		},
-		expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
 	}
 
-	if !vfs2 {
-		vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
-		vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
-	}
-	return append(testCases, vfsCase)
+	return testCases
 }
 
 // Test that MountNamespace can be created with various specs.
 func TestCreateMountNamespace(t *testing.T) {
-	for _, tc := range createMountTestcases(false /* vfs2 */) {
+	for _, tc := range createMountTestcases() {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
 			ctx := contexttest.Context(t)
@@ -439,7 +440,7 @@ func TestCreateMountNamespace(t *testing.T) {
 			}
 			defer cleanup()
 
-			mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
+			mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{})
 			mns, err := mntr.createMountNamespace(ctx, conf)
 			if err != nil {
 				t.Fatalf("failed to create mount namespace: %v", err)
@@ -465,7 +466,7 @@ func TestCreateMountNamespace(t *testing.T) {
 
 // Test that MountNamespace can be created with various specs.
 func TestCreateMountNamespaceVFS2(t *testing.T) {
-	for _, tc := range createMountTestcases(true /* vfs2 */) {
+	for _, tc := range createMountTestcases() {
 		t.Run(tc.name, func(t *testing.T) {
 			spec := testSpec()
 			spec.Mounts = tc.spec.Mounts
@@ -485,12 +486,13 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
 			}
 
 			ctx := l.k.SupervisorContext()
-			mns, err := mntr.setupVFS2(ctx, l.root.conf, &l.root.procArgs)
+			mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs)
 			if err != nil {
-				t.Fatalf("failed to setupVFS2: %v", err)
+				t.Fatalf("mountAll: %v", err)
 			}
 
 			root := mns.Root()
+			root.IncRef()
 			defer root.DecRef(ctx)
 			for _, p := range tc.expectedPaths {
 				target := &vfs.PathOperation{
@@ -545,7 +547,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 					},
 					"tmpfs": {
@@ -599,7 +601,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 						{
 							Dev:        "9pfs-/dev/fd-foo",
@@ -657,7 +659,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 					},
 					"tmpfs": {
@@ -697,7 +699,11 @@ func TestRestoreEnvironment(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
-			mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{})
+			var ioFDs []*fd.FD
+			for _, ioFD := range tc.ioFDs {
+				ioFDs = append(ioFDs, fd.New(ioFD))
+			}
+			mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{})
 			actualRenv, err := mntr.createRestoreEnvironment(conf)
 			if !tc.errorExpected && err != nil {
 				t.Fatalf("could not create restore environment for test:%s", tc.name)
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 4e1fa7665..988573640 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -33,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 var (
@@ -78,44 +79,6 @@ type DefaultRoute struct {
 	Name  string
 }
 
-// QueueingDiscipline is used to specify the kind of Queueing Discipline to
-// apply for a give FDBasedLink.
-type QueueingDiscipline int
-
-const (
-	// QDiscNone disables any queueing for the underlying FD.
-	QDiscNone QueueingDiscipline = iota
-
-	// QDiscFIFO applies a simple fifo based queue to the underlying
-	// FD.
-	QDiscFIFO
-)
-
-// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s
-// else returns an error.
-func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
-	switch s {
-	case "none":
-		return QDiscNone, nil
-	case "fifo":
-		return QDiscFIFO, nil
-	default:
-		return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
-	}
-}
-
-// String implements fmt.Stringer.
-func (q QueueingDiscipline) String() string {
-	switch q {
-	case QDiscNone:
-		return "none"
-	case QDiscFIFO:
-		return "fifo"
-	default:
-		panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
-	}
-}
-
 // FDBasedLink configures an fd-based link.
 type FDBasedLink struct {
 	Name               string
@@ -127,7 +90,7 @@ type FDBasedLink struct {
 	TXChecksumOffload  bool
 	RXChecksumOffload  bool
 	LinkAddress        net.HardwareAddr
-	QDisc              QueueingDiscipline
+	QDisc              config.QueueingDiscipline
 
 	// NumChannels controls how many underlying FD's are to be used to
 	// create this endpoint.
@@ -247,8 +210,8 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 
 		switch link.QDisc {
-		case QDiscNone:
-		case QDiscFIFO:
+		case config.QDiscNone:
+		case config.QDiscFIFO:
 			log.Infof("Enabling FIFO QDisc on %q", link.Name)
 			linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
 		}
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
index fbfd3b07c..c21648a32 100644
--- a/runsc/boot/strace.go
+++ b/runsc/boot/strace.go
@@ -15,10 +15,13 @@
 package boot
 
 import (
+	"strings"
+
 	"gvisor.dev/gvisor/pkg/sentry/strace"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
-func enableStrace(conf *Config) error {
+func enableStrace(conf *config.Config) error {
 	// We must initialize even if strace is not enabled.
 	strace.Initialize()
 
@@ -36,5 +39,5 @@ func enableStrace(conf *Config) error {
 		strace.EnableAll(strace.SinkTypeLog)
 		return nil
 	}
-	return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog)
+	return strace.Enable(strings.Split(conf.StraceSyscalls, ","), strace.SinkTypeLog)
 }
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 08dce8b6c..b157387ef 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -16,12 +16,12 @@ package boot
 
 import (
 	"fmt"
-	"path"
 	"sort"
 	"strings"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
@@ -42,6 +42,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 func registerFilesystems(k *kernel.Kernel) error {
@@ -133,8 +134,8 @@ func registerFilesystems(k *kernel.Kernel) error {
 	return nil
 }
 
-func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
-	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	mns, err := mntr.mountAll(conf, procArgs)
 	if err != nil {
 		return fmt.Errorf("failed to setupFS: %w", err)
 	}
@@ -149,7 +150,7 @@ func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounte
 	return nil
 }
 
-func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
 	log.Infof("Configuring container's file system with VFS2")
 
 	// Create context with root credentials to mount the filesystem (the current
@@ -168,35 +169,141 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
 	}
 	rootProcArgs.MountNamespaceVFS2 = mns
 
+	root := mns.Root()
+	root.IncRef()
+	defer root.DecRef(rootCtx)
+	if root.Mount().ReadOnly() {
+		// Switch to ReadWrite while we setup submounts.
+		if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
+			return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
+		}
+		// Restore back to ReadOnly at the end.
+		defer func() {
+			if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
+				panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
+			}
+		}()
+	}
+
 	// Mount submounts.
 	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
 		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
 	}
+
 	return mns, nil
 }
 
-func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
+// createMountNamespaceVFS2 creates the container's root mount and namespace.
+func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
 	fd := c.fds.remove()
-	opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+	data := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
 
 	if conf.OverlayfsStaleRead {
 		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
 		// can only send mount options for specs.Mounts (specs.Root is missing
 		// Options field). So assume root is always on top of overlayfs.
-		opts = append(opts, "overlayfs_stale_read")
+		data = append(data, "overlayfs_stale_read")
 	}
 
 	log.Infof("Mounting root over 9P, ioFD: %d", fd)
-	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{
-		Data: strings.Join(opts, ","),
-	})
+	opts := &vfs.MountOptions{
+		ReadOnly: c.root.Readonly,
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			Data: strings.Join(data, ","),
+			InternalData: gofer.InternalFilesystemOptions{
+				UniqueID: "/",
+			},
+		},
+		InternalMount: true,
+	}
+
+	fsName := gofer.Name
+	if conf.Overlay && !c.root.Readonly {
+		log.Infof("Adding overlay on top of root")
+		var err error
+		var cleanup func()
+		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+		if err != nil {
+			return nil, fmt.Errorf("mounting root with overlay: %w", err)
+		}
+		defer cleanup()
+		fsName = overlay.Name
+	}
+
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts)
 	if err != nil {
 		return nil, fmt.Errorf("setting up mount namespace: %w", err)
 	}
 	return mns, nil
 }
 
-func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
+// layer using tmpfs, and return overlay mount options. "cleanup" must be called
+// after the options have been used to mount the overlay, to release refs on
+// lower and upper mounts.
+func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) {
+	// First copy options from lower layer to upper layer and overlay. Clear
+	// filesystem specific options.
+	upperOpts := *lowerOpts
+	upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+	overlayOpts := *lowerOpts
+	overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+	// Next mount upper and lower. Upper is a tmpfs mount to keep all
+	// modifications inside the sandbox.
+	upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
+	}
+	cu := cleanup.Make(func() { upper.DecRef(ctx) })
+	defer cu.Clean()
+
+	// All writes go to the upper layer, be paranoid and make lower readonly.
+	lowerOpts.ReadOnly = true
+	lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
+	if err != nil {
+		return nil, nil, err
+	}
+	cu.Add(func() { lower.DecRef(ctx) })
+
+	// Propagate the lower layer's root's owner, group, and mode to the upper
+	// layer's root for consistency with VFS1.
+	upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
+	lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root())
+	stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{
+		Root:  lowerRootVD,
+		Start: lowerRootVD,
+	}, &vfs.StatOptions{
+		Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE,
+	})
+	if err != nil {
+		return nil, nil, err
+	}
+	err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{
+		Root:  upperRootVD,
+		Start: upperRootVD,
+	}, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask,
+			UID:  stat.UID,
+			GID:  stat.GID,
+			Mode: stat.Mode,
+		},
+	})
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// Configure overlay with both layers.
+	overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
+		UpperRoot:  upperRootVD,
+		LowerRoots: []vfs.VirtualDentry{lowerRootVD},
+	}
+	return &overlayOpts, cu.Release(), nil
+}
+
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
 	mounts, err := c.prepareMountsVFS2()
 	if err != nil {
 		return err
@@ -205,15 +312,35 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config,
 	for i := range mounts {
 		submount := &mounts[i]
 		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
+		var (
+			mnt *vfs.Mount
+			err error
+		)
+
 		if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() {
-			if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil {
+			mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint)
+			if err != nil {
 				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err)
 			}
 		} else {
-			if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
+			mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount)
+			if err != nil {
 				return fmt.Errorf("mount submount %q: %w", submount.Destination, err)
 			}
 		}
+
+		if mnt != nil && mnt.ReadOnly() {
+			// Switch to ReadWrite while we setup submounts.
+			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
+				return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err)
+			}
+			// Restore back to ReadOnly at the end.
+			defer func() {
+				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
+					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err))
+				}
+			}()
+		}
 	}
 
 	if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil {
@@ -256,38 +383,54 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
 	return mounts, nil
 }
 
-func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) {
+	fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount)
+	if err != nil {
+		return nil, fmt.Errorf("mountOptions failed: %w", err)
+	}
+	if len(fsName) == 0 {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil, nil
+	}
+
+	if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil {
+		return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err)
+	}
+
+	if useOverlay {
+		log.Infof("Adding overlay on top of mount %q", submount.Destination)
+		var cleanup func()
+		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+		if err != nil {
+			return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err)
+		}
+		defer cleanup()
+		fsName = overlay.Name
+	}
+
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 	target := &vfs.PathOperation{
 		Root:  root,
 		Start: root,
 		Path:  fspath.Parse(submount.Destination),
 	}
-	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
+	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
 	if err != nil {
-		return fmt.Errorf("mountOptions failed: %w", err)
-	}
-	if len(fsName) == 0 {
-		// Filesystem is not supported (e.g. cgroup), just skip it.
-		return nil
-	}
-
-	if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
-		return err
-	}
-	if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
-		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+		return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
 	}
 	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data)
-	return nil
+	return mnt, nil
 }
 
 // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
 // used for mounts.
-func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) {
 	fsName := m.Type
+	useOverlay := false
 	var data []string
+	var iopts interface{}
 
 	// Find filesystem name and FS specific data field.
 	switch m.Type {
@@ -301,7 +444,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
 		var err error
 		data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
 		if err != nil {
-			return "", nil, err
+			return "", nil, false, err
 		}
 
 	case bind:
@@ -309,18 +452,25 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
 		if m.fd == 0 {
 			// Check that an FD was provided to fails fast. Technically FD=0 is valid,
 			// but unlikely to be correct in this context.
-			return "", nil, fmt.Errorf("9P mount requires a connection FD")
+			return "", nil, false, fmt.Errorf("9P mount requires a connection FD")
 		}
 		data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
+		iopts = gofer.InternalFilesystemOptions{
+			UniqueID: m.Destination,
+		}
+
+		// If configured, add overlay to all writable mounts.
+		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
 
 	default:
 		log.Warningf("ignoring unknown filesystem type %q", m.Type)
-		return "", nil, nil
+		return "", nil, false, nil
 	}
 
 	opts := &vfs.MountOptions{
 		GetFilesystemOptions: vfs.GetFilesystemOptions{
-			Data: strings.Join(data, ","),
+			Data:         strings.Join(data, ","),
+			InternalData: iopts,
 		},
 		InternalMount: true,
 	}
@@ -340,38 +490,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
 		}
 	}
 
-	if conf.Overlay {
-		// All writes go to upper, be paranoid and make lower readonly.
-		opts.ReadOnly = true
-	}
-	return fsName, opts, nil
-}
-
-func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
-	target := &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(currentPath),
-	}
-	_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
-	if err == nil {
-		log.Debugf("Mount point %q already exists", currentPath)
-		return nil
-	}
-	if err != syserror.ENOENT {
-		return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err)
-	}
-
-	// Recurse to ensure parent is created and then create the mount point.
-	if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil {
-		return err
-	}
-	log.Debugf("Creating dir %q for mount point", currentPath)
-	mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
-	if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil {
-		return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err)
-	}
-	return nil
+	return fsName, opts, useOverlay, nil
 }
 
 // mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
@@ -383,7 +502,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
 //
 // Note that when there are submounts inside of '/tmp', directories for the
 // mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
+func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
 	for _, m := range c.mounts {
 		// m.Destination has been cleaned, so it's to use equality here.
 		if m.Destination == "/tmp" {
@@ -393,6 +512,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
 	}
 
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 	pop := vfs.PathOperation{
 		Root:  root,
@@ -434,7 +554,8 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
 			// another user. This is normally done for /tmp.
 			Options: []string{"mode=01777"},
 		}
-		return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+		_, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+		return err
 
 	case syserror.ENOTDIR:
 		// Not a dir?! Let it be.
@@ -448,7 +569,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
 // processHintsVFS2 processes annotations that container hints about how volumes
 // should be mounted (e.g. a volume shared between containers). It must be
 // called for the root container only.
-func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error {
+func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Credentials) error {
 	ctx := c.k.SupervisorContext()
 	for _, hint := range c.hints.mounts {
 		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
@@ -469,51 +590,106 @@ func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credential
 
 // mountSharedMasterVFS2 mounts the master of a volume that is shared among
 // containers in a pod.
-func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
+func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
 	// Map mount type to filesystem name, and parse out the options that we are
 	// capable of dealing with.
 	mntFD := &mountAndFD{Mount: hint.mount}
-	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
+	fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
 	if err != nil {
 		return nil, err
 	}
 	if len(fsName) == 0 {
 		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
 	}
+
+	if useOverlay {
+		log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination)
+		var cleanup func()
+		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+		if err != nil {
+			return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err)
+		}
+		defer cleanup()
+		fsName = overlay.Name
+	}
+
 	return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
 }
 
 // mountSharedSubmount binds mount to a previously mounted volume that is shared
 // among containers in the same pod.
-func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error {
+func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) {
 	if err := source.checkCompatible(mount); err != nil {
-		return err
+		return nil, err
 	}
 
-	_, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
+	// Ignore data and useOverlay because these were already applied to
+	// the master mount.
+	_, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
 	if err != nil {
-		return err
+		return nil, err
 	}
 	newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	defer newMnt.DecRef(ctx)
 
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
-	if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil {
-		return err
-	}
-
 	target := &vfs.PathOperation{
 		Root:  root,
 		Start: root,
 		Path:  fspath.Parse(mount.Destination),
 	}
+
+	if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil {
+		return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err)
+	}
+
 	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
-		return err
+		return nil, err
 	}
 	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
-	return nil
+	return newMnt, nil
+}
+
+func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
+	root := mns.Root()
+	root.IncRef()
+	defer root.DecRef(ctx)
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(dest),
+	}
+	// First check if mount point exists. When overlay is enabled, gofer doesn't
+	// allow changes to the FS, making MakeSytheticMountpoint() ineffective
+	// because MkdirAt fails with EROFS even if file exists.
+	vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
+	if err == nil {
+		// File exists, we're done.
+		vd.DecRef(ctx)
+		return nil
+	}
+	return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
+}
+
+// configureRestore returns an updated context.Context including filesystem
+// state used by restore defined by conf.
+func (c *containerMounter) configureRestore(ctx context.Context, conf *config.Config) (context.Context, error) {
+	fdmap := make(map[string]int)
+	fdmap["/"] = c.fds.remove()
+	mounts, err := c.prepareMountsVFS2()
+	if err != nil {
+		return ctx, err
+	}
+	for i := range c.mounts {
+		submount := &mounts[i]
+		if submount.fd >= 0 {
+			fdmap[submount.Destination] = submount.fd
+		}
+	}
+	return context.WithValue(ctx, gofer.CtxRestoreServerFDMap, fdmap), nil
 }
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index 8fbc3887a..56da21584 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -201,13 +201,15 @@ func LoadPaths(pid string) (map[string]string, error) {
 	paths := make(map[string]string)
 	scanner := bufio.NewScanner(f)
 	for scanner.Scan() {
-		// Format: ID:controller1,controller2:path
+		// Format: ID:[name=]controller1,controller2:path
 		// Example: 2:cpu,cpuacct:/user.slice
 		tokens := strings.Split(scanner.Text(), ":")
 		if len(tokens) != 3 {
 			return nil, fmt.Errorf("invalid cgroups file, line: %q", scanner.Text())
 		}
 		for _, ctrlr := range strings.Split(tokens[1], ",") {
+			// Remove prefix for cgroups with no controller, eg. systemd.
+			ctrlr = strings.TrimPrefix(ctrlr, "name=")
 			paths[ctrlr] = tokens[2]
 		}
 	}
@@ -237,7 +239,7 @@ func New(spec *specs.Spec) (*Cgroup, error) {
 		var err error
 		parents, err = LoadPaths("self")
 		if err != nil {
-			return nil, fmt.Errorf("finding current cgroups: %v", err)
+			return nil, fmt.Errorf("finding current cgroups: %w", err)
 		}
 	}
 	return &Cgroup{
@@ -276,10 +278,8 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error {
 			}
 			return err
 		}
-		if res != nil {
-			if err := cfg.ctrlr.set(res, path); err != nil {
-				return err
-			}
+		if err := cfg.ctrlr.set(res, path); err != nil {
+			return err
 		}
 	}
 	clean.Release()
@@ -304,14 +304,15 @@ func (c *Cgroup) Uninstall() error {
 		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 		defer cancel()
 		b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
-		if err := backoff.Retry(func() error {
+		fn := func() error {
 			err := syscall.Rmdir(path)
 			if os.IsNotExist(err) {
 				return nil
 			}
 			return err
-		}, b); err != nil {
-			return fmt.Errorf("removing cgroup path %q: %v", path, err)
+		}
+		if err := backoff.Retry(fn, b); err != nil {
+			return fmt.Errorf("removing cgroup path %q: %w", path, err)
 		}
 	}
 	return nil
@@ -332,7 +333,6 @@ func (c *Cgroup) Join() (func(), error) {
 		if _, ok := controllers[ctrlr]; ok {
 			fullPath := filepath.Join(cgroupRoot, ctrlr, path)
 			undoPaths = append(undoPaths, fullPath)
-			break
 		}
 	}
 
@@ -422,7 +422,7 @@ func (*noop) set(*specs.LinuxResources, string) error {
 type memory struct{}
 
 func (*memory) set(spec *specs.LinuxResources, path string) error {
-	if spec.Memory == nil {
+	if spec == nil || spec.Memory == nil {
 		return nil
 	}
 	if err := setOptionalValueInt(path, "memory.limit_in_bytes", spec.Memory.Limit); err != nil {
@@ -455,7 +455,7 @@ func (*memory) set(spec *specs.LinuxResources, path string) error {
 type cpu struct{}
 
 func (*cpu) set(spec *specs.LinuxResources, path string) error {
-	if spec.CPU == nil {
+	if spec == nil || spec.CPU == nil {
 		return nil
 	}
 	if err := setOptionalValueUint(path, "cpu.shares", spec.CPU.Shares); err != nil {
@@ -478,7 +478,7 @@ type cpuSet struct{}
 func (*cpuSet) set(spec *specs.LinuxResources, path string) error {
 	// cpuset.cpus and mems are required fields, but are not set on a new cgroup.
 	// If not set in the spec, get it from one of the ancestors cgroup.
-	if spec.CPU == nil || spec.CPU.Cpus == "" {
+	if spec == nil || spec.CPU == nil || spec.CPU.Cpus == "" {
 		if _, err := fillFromAncestor(filepath.Join(path, "cpuset.cpus")); err != nil {
 			return err
 		}
@@ -488,18 +488,17 @@ func (*cpuSet) set(spec *specs.LinuxResources, path string) error {
 		}
 	}
 
-	if spec.CPU == nil || spec.CPU.Mems == "" {
+	if spec == nil || spec.CPU == nil || spec.CPU.Mems == "" {
 		_, err := fillFromAncestor(filepath.Join(path, "cpuset.mems"))
 		return err
 	}
-	mems := spec.CPU.Mems
-	return setValue(path, "cpuset.mems", mems)
+	return setValue(path, "cpuset.mems", spec.CPU.Mems)
 }
 
 type blockIO struct{}
 
 func (*blockIO) set(spec *specs.LinuxResources, path string) error {
-	if spec.BlockIO == nil {
+	if spec == nil || spec.BlockIO == nil {
 		return nil
 	}
 
@@ -549,7 +548,7 @@ func setThrottle(path, name string, devs []specs.LinuxThrottleDevice) error {
 type networkClass struct{}
 
 func (*networkClass) set(spec *specs.LinuxResources, path string) error {
-	if spec.Network == nil {
+	if spec == nil || spec.Network == nil {
 		return nil
 	}
 	return setOptionalValueUint32(path, "net_cls.classid", spec.Network.ClassID)
@@ -558,7 +557,7 @@ func (*networkClass) set(spec *specs.LinuxResources, path string) error {
 type networkPrio struct{}
 
 func (*networkPrio) set(spec *specs.LinuxResources, path string) error {
-	if spec.Network == nil {
+	if spec == nil || spec.Network == nil {
 		return nil
 	}
 	for _, prio := range spec.Network.Priorities {
@@ -573,7 +572,7 @@ func (*networkPrio) set(spec *specs.LinuxResources, path string) error {
 type pids struct{}
 
 func (*pids) set(spec *specs.LinuxResources, path string) error {
-	if spec.Pids == nil || spec.Pids.Limit <= 0 {
+	if spec == nil || spec.Pids == nil || spec.Pids.Limit <= 0 {
 		return nil
 	}
 	val := strconv.FormatInt(spec.Pids.Limit, 10)
@@ -583,6 +582,9 @@ func (*pids) set(spec *specs.LinuxResources, path string) error {
 type hugeTLB struct{}
 
 func (*hugeTLB) set(spec *specs.LinuxResources, path string) error {
+	if spec == nil {
+		return nil
+	}
 	for _, limit := range spec.HugepageLimits {
 		name := fmt.Sprintf("hugetlb.%s.limit_in_bytes", limit.Pagesize)
 		val := strconv.FormatUint(limit.Limit, 10)
diff --git a/runsc/cli/BUILD b/runsc/cli/BUILD
new file mode 100644
index 000000000..32cce2a18
--- /dev/null
+++ b/runsc/cli/BUILD
@@ -0,0 +1,22 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "cli",
+    srcs = ["main.go"],
+    visibility = [
+        "//:__pkg__",
+        "//runsc:__pkg__",
+    ],
+    deps = [
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/sentry/platform",
+        "//runsc/cmd",
+        "//runsc/config",
+        "//runsc/flag",
+        "//runsc/specutils",
+        "@com_github_google_subcommands//:go_default_library",
+    ],
+)
diff --git a/runsc/cli/main.go b/runsc/cli/main.go
new file mode 100644
index 000000000..bca015db5
--- /dev/null
+++ b/runsc/cli/main.go
@@ -0,0 +1,256 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cli is the main entrypoint for runsc.
+package cli
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"os/signal"
+	"syscall"
+	"time"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/runsc/cmd"
+	"gvisor.dev/gvisor/runsc/config"
+	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+var (
+	// Although these flags are not part of the OCI spec, they are used by
+	// Docker, and thus should not be changed.
+	// TODO(gvisor.dev/issue/193): support systemd cgroups
+	systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.")
+	showVersion   = flag.Bool("version", false, "show version and exit.")
+
+	// These flags are unique to runsc, and are used to configure parts of the
+	// system that are not covered by the runtime spec.
+
+	// Debugging flags.
+	logFD      = flag.Int("log-fd", -1, "file descriptor to log to.  If set, the 'log' flag is ignored.")
+	debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to.  If set, the 'debug-log-dir' flag is ignored.")
+	panicLogFD = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.")
+)
+
+// Main is the main entrypoint.
+func Main(version string) {
+	// Help and flags commands are generated automatically.
+	help := cmd.NewHelp(subcommands.DefaultCommander)
+	help.Register(new(cmd.Syscalls))
+	subcommands.Register(help, "")
+	subcommands.Register(subcommands.FlagsCommand(), "")
+
+	// Installation helpers.
+	const helperGroup = "helpers"
+	subcommands.Register(new(cmd.Install), helperGroup)
+	subcommands.Register(new(cmd.Uninstall), helperGroup)
+
+	// Register user-facing runsc commands.
+	subcommands.Register(new(cmd.Checkpoint), "")
+	subcommands.Register(new(cmd.Create), "")
+	subcommands.Register(new(cmd.Delete), "")
+	subcommands.Register(new(cmd.Do), "")
+	subcommands.Register(new(cmd.Events), "")
+	subcommands.Register(new(cmd.Exec), "")
+	subcommands.Register(new(cmd.Gofer), "")
+	subcommands.Register(new(cmd.Kill), "")
+	subcommands.Register(new(cmd.List), "")
+	subcommands.Register(new(cmd.Pause), "")
+	subcommands.Register(new(cmd.PS), "")
+	subcommands.Register(new(cmd.Restore), "")
+	subcommands.Register(new(cmd.Resume), "")
+	subcommands.Register(new(cmd.Run), "")
+	subcommands.Register(new(cmd.Spec), "")
+	subcommands.Register(new(cmd.State), "")
+	subcommands.Register(new(cmd.Start), "")
+	subcommands.Register(new(cmd.Wait), "")
+
+	// Register internal commands with the internal group name. This causes
+	// them to be sorted below the user-facing commands with empty group.
+	// The string below will be printed above the commands.
+	const internalGroup = "internal use only"
+	subcommands.Register(new(cmd.Boot), internalGroup)
+	subcommands.Register(new(cmd.Debug), internalGroup)
+	subcommands.Register(new(cmd.Gofer), internalGroup)
+	subcommands.Register(new(cmd.Statefile), internalGroup)
+
+	config.RegisterFlags()
+
+	// All subcommands must be registered before flag parsing.
+	flag.Parse()
+
+	// Are we showing the version?
+	if *showVersion {
+		// The format here is the same as runc.
+		fmt.Fprintf(os.Stdout, "runsc version %s\n", version)
+		fmt.Fprintf(os.Stdout, "spec: %s\n", specutils.Version)
+		os.Exit(0)
+	}
+
+	// Create a new Config from the flags.
+	conf, err := config.NewFromFlags()
+	if err != nil {
+		cmd.Fatalf(err.Error())
+	}
+
+	// TODO(gvisor.dev/issue/193): support systemd cgroups
+	if *systemdCgroup {
+		fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193")
+		os.Exit(1)
+	}
+
+	var errorLogger io.Writer
+	if *logFD > -1 {
+		errorLogger = os.NewFile(uintptr(*logFD), "error log file")
+
+	} else if conf.LogFilename != "" {
+		// We must set O_APPEND and not O_TRUNC because Docker passes
+		// the same log file for all commands (and also parses these
+		// log files), so we can't destroy them on each command.
+		var err error
+		errorLogger, err = os.OpenFile(conf.LogFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
+		if err != nil {
+			cmd.Fatalf("error opening log file %q: %v", conf.LogFilename, err)
+		}
+	}
+	cmd.ErrorLogger = errorLogger
+
+	if _, err := platform.Lookup(conf.Platform); err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	// Sets the reference leak check mode. Also set it in config below to
+	// propagate it to child processes.
+	refs.SetLeakMode(conf.ReferenceLeak)
+
+	// Set up logging.
+	if conf.Debug {
+		log.SetLevel(log.Debug)
+	}
+
+	// Logging will include the local date and time via the time package.
+	//
+	// On first use, time.Local initializes the local time zone, which
+	// involves opening tzdata files on the host. Since this requires
+	// opening host files, it must be done before syscall filter
+	// installation.
+	//
+	// Generally there will be a log message before filter installation
+	// that will force initialization, but force initialization here in
+	// case that does not occur.
+	_ = time.Local.String()
+
+	subcommand := flag.CommandLine.Arg(0)
+
+	var e log.Emitter
+	if *debugLogFD > -1 {
+		f := os.NewFile(uintptr(*debugLogFD), "debug log file")
+
+		e = newEmitter(conf.DebugLogFormat, f)
+
+	} else if conf.DebugLog != "" {
+		f, err := specutils.DebugLogFile(conf.DebugLog, subcommand, "" /* name */)
+		if err != nil {
+			cmd.Fatalf("error opening debug log file in %q: %v", conf.DebugLog, err)
+		}
+		e = newEmitter(conf.DebugLogFormat, f)
+
+	} else {
+		// Stderr is reserved for the application, just discard the logs if no debug
+		// log is specified.
+		e = newEmitter("text", ioutil.Discard)
+	}
+
+	if *panicLogFD > -1 || *debugLogFD > -1 {
+		fd := *panicLogFD
+		if fd < 0 {
+			fd = *debugLogFD
+		}
+		// Quick sanity check to make sure no other commands get passed
+		// a log fd (they should use log dir instead).
+		if subcommand != "boot" && subcommand != "gofer" {
+			cmd.Fatalf("flags --debug-log-fd and --panic-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand)
+		}
+
+		// If we are the boot process, then we own our stdio FDs and can do what we
+		// want with them. Since Docker and Containerd both eat boot's stderr, we
+		// dup our stderr to the provided log FD so that panics will appear in the
+		// logs, rather than just disappear.
+		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil {
+			cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err)
+		}
+	} else if conf.AlsoLogToStderr {
+		e = &log.MultiEmitter{e, newEmitter(conf.DebugLogFormat, os.Stderr)}
+	}
+
+	log.SetTarget(e)
+
+	log.Infof("***************************")
+	log.Infof("Args: %s", os.Args)
+	log.Infof("Version %s", version)
+	log.Infof("PID: %d", os.Getpid())
+	log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid())
+	log.Infof("Configuration:")
+	log.Infof("\t\tRootDir: %s", conf.RootDir)
+	log.Infof("\t\tPlatform: %v", conf.Platform)
+	log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay)
+	log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets)
+	log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
+	log.Infof("\t\tVFS2 enabled: %v", conf.VFS2)
+	log.Infof("***************************")
+
+	if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+		// SIGTERM is sent to all processes if a test exceeds its
+		// timeout and this case is handled by syscall_test_runner.
+		log.Warningf("Block the TERM signal. This is only safe in tests!")
+		signal.Ignore(syscall.SIGTERM)
+	}
+
+	// Call the subcommand and pass in the configuration.
+	var ws syscall.WaitStatus
+	subcmdCode := subcommands.Execute(context.Background(), conf, &ws)
+	if subcmdCode == subcommands.ExitSuccess {
+		log.Infof("Exiting with status: %v", ws)
+		if ws.Signaled() {
+			// No good way to return it, emulate what the shell does. Maybe raise
+			// signal to self?
+			os.Exit(128 + int(ws.Signal()))
+		}
+		os.Exit(ws.ExitStatus())
+	}
+	// Return an error that is unlikely to be used by the application.
+	log.Warningf("Failure to execute command, err: %v", subcmdCode)
+	os.Exit(128)
+}
+
+func newEmitter(format string, logFile io.Writer) log.Emitter {
+	switch format {
+	case "text":
+		return log.GoogleEmitter{&log.Writer{Next: logFile}}
+	case "json":
+		return log.JSONEmitter{&log.Writer{Next: logFile}}
+	case "json-k8s":
+		return log.K8sJSONEmitter{&log.Writer{Next: logFile}}
+	}
+	cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format)
+	panic("unreachable")
+}
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index 1b5178dd5..2556f6d9e 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -51,6 +51,7 @@ go_library(
         "//pkg/unet",
         "//pkg/urpc",
         "//runsc/boot",
+        "//runsc/config",
         "//runsc/console",
         "//runsc/container",
         "//runsc/flag",
@@ -84,7 +85,7 @@ go_test(
         "//pkg/sentry/kernel/auth",
         "//pkg/test/testutil",
         "//pkg/urpc",
-        "//runsc/boot",
+        "//runsc/config",
         "//runsc/container",
         "//runsc/specutils",
         "@com_github_google_go_cmp//cmp:go_default_library",
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index f4f247721..cd419e1aa 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -27,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -133,7 +134,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	// Ensure that if there is a panic, all goroutine stacks are printed.
 	debug.SetTraceback("system")
 
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	if b.attached {
 		// Ensure this process is killed after parent process terminates when
@@ -167,7 +168,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	// Get the spec from the specFD.
 	specFile := os.NewFile(uintptr(b.specFD), "spec file")
 	defer specFile.Close()
-	spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile)
+	spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile, conf)
 	if err != nil {
 		Fatalf("reading spec: %v", err)
 	}
diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go
index a84067112..e13a94486 100644
--- a/runsc/cmd/capability_test.go
+++ b/runsc/cmd/capability_test.go
@@ -24,7 +24,7 @@ import (
 	"github.com/syndtr/gocapability/capability"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/test/testutil"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -88,7 +88,7 @@ func TestCapabilities(t *testing.T) {
 	conf := testutil.TestConfig(t)
 
 	// Use --network=host to make sandbox use spec's capabilities.
-	conf.Network = boot.NetworkHost
+	conf.Network = config.NetworkHost
 
 	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go
index 8a29e521e..8fe0c427a 100644
--- a/runsc/cmd/checkpoint.go
+++ b/runsc/cmd/checkpoint.go
@@ -22,7 +22,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -72,7 +72,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	cont, err := container.Load(conf.RootDir, id)
@@ -118,7 +118,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa
 		Fatalf("setting bundleDir")
 	}
 
-	spec, err := specutils.ReadSpec(bundleDir)
+	spec, err := specutils.ReadSpec(bundleDir, conf)
 	if err != nil {
 		Fatalf("reading spec: %v", err)
 	}
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
index 910e97577..e76f7ba1d 100644
--- a/runsc/cmd/create.go
+++ b/runsc/cmd/create.go
@@ -18,7 +18,7 @@ import (
 	"context"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -81,7 +81,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	if conf.Rootless {
 		return Errorf("Rootless mode not supported with %q", c.Name())
@@ -91,7 +91,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 	if bundleDir == "" {
 		bundleDir = getwdOrDie()
 	}
-	spec, err := specutils.ReadSpec(bundleDir)
+	spec, err := specutils.ReadSpec(bundleDir, conf)
 	if err != nil {
 		return Errorf("reading spec: %v", err)
 	}
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 742f8c344..132198222 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -25,7 +25,7 @@ import (
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -82,7 +82,7 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
 // Execute implements subcommands.Command.Execute.
 func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
 	var c *container.Container
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	if d.pid == 0 {
 		// No pid, container ID must have been provided.
diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go
index 0e4863f50..4e49deff8 100644
--- a/runsc/cmd/delete.go
+++ b/runsc/cmd/delete.go
@@ -21,7 +21,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -59,14 +59,14 @@ func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 		return subcommands.ExitUsageError
 	}
 
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	if err := d.execute(f.Args(), conf); err != nil {
 		Fatalf("%v", err)
 	}
 	return subcommands.ExitSuccess
 }
 
-func (d *Delete) execute(ids []string, conf *boot.Config) error {
+func (d *Delete) execute(ids []string, conf *config.Config) error {
 	for _, id := range ids {
 		c, err := container.Load(conf.RootDir, id)
 		if err != nil {
diff --git a/runsc/cmd/delete_test.go b/runsc/cmd/delete_test.go
index cb59516a3..e2d994a05 100644
--- a/runsc/cmd/delete_test.go
+++ b/runsc/cmd/delete_test.go
@@ -18,7 +18,7 @@ import (
 	"io/ioutil"
 	"testing"
 
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 func TestNotFound(t *testing.T) {
@@ -27,7 +27,7 @@ func TestNotFound(t *testing.T) {
 	if err != nil {
 		t.Fatalf("error creating dir: %v", err)
 	}
-	conf := &boot.Config{RootDir: dir}
+	conf := &config.Config{RootDir: dir}
 
 	d := Delete{}
 	if err := d.execute(ids, conf); err == nil {
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
index 7d1310c96..640de4c47 100644
--- a/runsc/cmd/do.go
+++ b/runsc/cmd/do.go
@@ -17,6 +17,7 @@ package cmd
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io/ioutil"
 	"math/rand"
@@ -30,12 +31,14 @@ import (
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
+var errNoDefaultInterface = errors.New("no default interface found")
+
 // Do implements subcommands.Command for the "do" command. It sets up a simple
 // sandbox and executes the command inside it. See Usage() for more details.
 type Do struct {
@@ -82,7 +85,7 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
 		return subcommands.ExitUsageError
 	}
 
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	if conf.Rootless {
@@ -125,27 +128,29 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
 	specutils.LogSpec(spec)
 
 	cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
-	if conf.Network == boot.NetworkNone {
-		netns := specs.LinuxNamespace{
-			Type: specs.NetworkNamespace,
-		}
-		if spec.Linux != nil {
-			panic("spec.Linux is not nil")
-		}
-		spec.Linux = &specs.Linux{Namespaces: []specs.LinuxNamespace{netns}}
+	if conf.Network == config.NetworkNone {
+		addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace})
 
 	} else if conf.Rootless {
-		if conf.Network == boot.NetworkSandbox {
-			c.notifyUser("*** Warning: using host network due to --rootless ***")
-			conf.Network = boot.NetworkHost
+		if conf.Network == config.NetworkSandbox {
+			c.notifyUser("*** Warning: sandbox network isn't supported with --rootless, switching to host ***")
+			conf.Network = config.NetworkHost
 		}
 
 	} else {
-		clean, err := c.setupNet(cid, spec)
-		if err != nil {
+		switch clean, err := c.setupNet(cid, spec); err {
+		case errNoDefaultInterface:
+			log.Warningf("Network interface not found, using internal network")
+			addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace})
+			conf.Network = config.NetworkHost
+
+		case nil:
+			// Setup successfull.
+			defer clean()
+
+		default:
 			return Errorf("Error setting up network: %v", err)
 		}
-		defer clean()
 	}
 
 	out, err := json.Marshal(spec)
@@ -199,6 +204,13 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
 	return subcommands.ExitSuccess
 }
 
+func addNamespace(spec *specs.Spec, ns specs.LinuxNamespace) {
+	if spec.Linux == nil {
+		spec.Linux = &specs.Linux{}
+	}
+	spec.Linux.Namespaces = append(spec.Linux.Namespaces, ns)
+}
+
 func (c *Do) notifyUser(format string, v ...interface{}) {
 	if !c.quiet {
 		fmt.Printf(format+"\n", v...)
@@ -219,10 +231,14 @@ func resolvePath(path string) (string, error) {
 	return path, nil
 }
 
+// setupNet setups up the sandbox network, including the creation of a network
+// namespace, and iptable rules to redirect the traffic. Returns a cleanup
+// function to tear down the network. Returns errNoDefaultInterface when there
+// is no network interface available to setup the network.
 func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) {
 	dev, err := defaultDevice()
 	if err != nil {
-		return nil, err
+		return nil, errNoDefaultInterface
 	}
 	peerIP, err := calculatePeerIP(c.ip)
 	if err != nil {
@@ -279,14 +295,11 @@ func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) {
 		return nil, err
 	}
 
-	if spec.Linux == nil {
-		spec.Linux = &specs.Linux{}
-	}
 	netns := specs.LinuxNamespace{
 		Type: specs.NetworkNamespace,
 		Path: filepath.Join("/var/run/netns", cid),
 	}
-	spec.Linux.Namespaces = append(spec.Linux.Namespaces, netns)
+	addNamespace(spec, netns)
 
 	return func() { c.cleanupNet(cid, dev, resolvPath, hostnamePath, hostsPath) }, nil
 }
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
index 51f6a98ed..25fe2cf1c 100644
--- a/runsc/cmd/events.go
+++ b/runsc/cmd/events.go
@@ -22,7 +22,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -72,7 +72,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index d9a94903e..775ed4b43 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -33,7 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/urpc"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/console"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
@@ -105,7 +105,7 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) {
 // Execute implements subcommands.Command.Execute. It starts a process in an
 // already created container.
 func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	e, id, err := ex.parseArgs(f, conf.EnableRaw)
 	if err != nil {
 		Fatalf("parsing process spec: %v", err)
@@ -220,7 +220,7 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi
 	cmd.Stderr = os.Stderr
 
 	// If the console control socket file is provided, then create a new
-	// pty master/slave pair and set the TTY on the sandbox process.
+	// pty master/replica pair and set the TTY on the sandbox process.
 	if ex.consoleSocket != "" {
 		// Create a new TTY pair and send the master on the provided socket.
 		tty, err := console.NewWithSocket(ex.consoleSocket)
@@ -229,7 +229,7 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi
 		}
 		defer tty.Close()
 
-		// Set stdio to the new TTY slave.
+		// Set stdio to the new TTY replica.
 		cmd.Stdin = tty
 		cmd.Stdout = tty
 		cmd.Stderr = tty
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 3966e2d21..371fcc0ae 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -30,7 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/fsgofer"
 	"gvisor.dev/gvisor/runsc/fsgofer/filter"
@@ -62,9 +62,8 @@ type Gofer struct {
 	applyCaps bool
 	setUpRoot bool
 
-	panicOnWrite bool
-	specFD       int
-	mountsFD     int
+	specFD   int
+	mountsFD int
 }
 
 // Name implements subcommands.Command.
@@ -87,7 +86,6 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) {
 	f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
 	f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec")
 	f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
-	f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected")
 	f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")
 	f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
 	f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).")
@@ -100,15 +98,15 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		return subcommands.ExitUsageError
 	}
 
+	conf := args[0].(*config.Config)
+
 	specFile := os.NewFile(uintptr(g.specFD), "spec file")
 	defer specFile.Close()
-	spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile)
+	spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile, conf)
 	if err != nil {
 		Fatalf("reading spec: %v", err)
 	}
 
-	conf := args[0].(*boot.Config)
-
 	if g.setUpRoot {
 		if err := setupRootFS(spec, conf); err != nil {
 			Fatalf("Error setting up root FS: %v", err)
@@ -168,8 +166,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	// Start with root mount, then add any other additional mount as needed.
 	ats := make([]p9.Attacher, 0, len(spec.Mounts)+1)
 	ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{
-		ROMount:      spec.Root.Readonly || conf.Overlay,
-		PanicOnWrite: g.panicOnWrite,
+		ROMount: spec.Root.Readonly || conf.Overlay,
 	})
 	if err != nil {
 		Fatalf("creating attach point: %v", err)
@@ -181,9 +178,8 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	for _, m := range spec.Mounts {
 		if specutils.Is9PMount(m) {
 			cfg := fsgofer.Config{
-				ROMount:      isReadonlyMount(m.Options) || conf.Overlay,
-				PanicOnWrite: g.panicOnWrite,
-				HostUDS:      conf.FSGoferHostUDS,
+				ROMount: isReadonlyMount(m.Options) || conf.Overlay,
+				HostUDS: conf.FSGoferHostUDS,
 			}
 			ap, err := fsgofer.NewAttachPoint(m.Destination, cfg)
 			if err != nil {
@@ -263,7 +259,7 @@ func isReadonlyMount(opts []string) bool {
 	return false
 }
 
-func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
+func setupRootFS(spec *specs.Spec, conf *config.Config) error {
 	// Convert all shared mounts into slaves to be sure that nothing will be
 	// propagated outside of our namespace.
 	if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
@@ -316,6 +312,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 		if err != nil {
 			return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err)
 		}
+		log.Infof("Create working directory %q if needed", spec.Process.Cwd)
 		if err := os.MkdirAll(dst, 0755); err != nil {
 			return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err)
 		}
@@ -346,7 +343,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 // setupMounts binds mount all mounts specified in the spec in their correct
 // location inside root. It will resolve relative paths and symlinks. It also
 // creates directories as needed.
-func setupMounts(conf *boot.Config, mounts []specs.Mount, root string) error {
+func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error {
 	for _, m := range mounts {
 		if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
 			continue
@@ -385,7 +382,7 @@ func setupMounts(conf *boot.Config, mounts []specs.Mount, root string) error {
 // Otherwise, it may follow symlinks to locations that would be overwritten
 // with another mount point and return the wrong location. In short, make sure
 // setupMounts() has been called before.
-func resolveMounts(conf *boot.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
+func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
 	cleanMounts := make([]specs.Mount, 0, len(mounts))
 	for _, m := range mounts {
 		if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
@@ -467,7 +464,7 @@ func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, erro
 }
 
 // adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs.
-func adjustMountOptions(conf *boot.Config, path string, opts []string) ([]string, error) {
+func adjustMountOptions(conf *config.Config, path string, opts []string) ([]string, error) {
 	rv := make([]string, len(opts))
 	copy(rv, opts)
 
diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go
index 8282ea0e0..04eee99b2 100644
--- a/runsc/cmd/kill.go
+++ b/runsc/cmd/kill.go
@@ -23,7 +23,7 @@ import (
 
 	"github.com/google/subcommands"
 	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -63,7 +63,7 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	if k.pid != 0 && k.all {
 		Fatalf("it is invalid to specify both --all and --pid")
diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go
index d8d906fe3..f92d6fef9 100644
--- a/runsc/cmd/list.go
+++ b/runsc/cmd/list.go
@@ -24,7 +24,7 @@ import (
 
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -63,7 +63,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		return subcommands.ExitUsageError
 	}
 
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	ids, err := container.List(conf.RootDir)
 	if err != nil {
 		Fatalf("%v", err)
diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go
index 6f95a9837..0eb1402ed 100644
--- a/runsc/cmd/pause.go
+++ b/runsc/cmd/pause.go
@@ -18,7 +18,7 @@ import (
 	"context"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -53,7 +53,7 @@ func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	cont, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
index 7fb8041af..bc58c928f 100644
--- a/runsc/cmd/ps.go
+++ b/runsc/cmd/ps.go
@@ -20,7 +20,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/sentry/control"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -58,7 +58,7 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go
index 72584b326..096ec814c 100644
--- a/runsc/cmd/restore.go
+++ b/runsc/cmd/restore.go
@@ -20,7 +20,7 @@ import (
 	"syscall"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -77,7 +77,7 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	if conf.Rootless {
@@ -88,7 +88,7 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{
 	if bundleDir == "" {
 		bundleDir = getwdOrDie()
 	}
-	spec, err := specutils.ReadSpec(bundleDir)
+	spec, err := specutils.ReadSpec(bundleDir, conf)
 	if err != nil {
 		return Errorf("reading spec: %v", err)
 	}
diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go
index 61a55a554..f24823f99 100644
--- a/runsc/cmd/resume.go
+++ b/runsc/cmd/resume.go
@@ -18,7 +18,7 @@ import (
 	"context"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -54,7 +54,7 @@ func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	cont, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
index cf41581ad..c48cbe4cd 100644
--- a/runsc/cmd/run.go
+++ b/runsc/cmd/run.go
@@ -19,7 +19,7 @@ import (
 	"syscall"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -64,7 +64,7 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	if conf.Rootless {
@@ -75,7 +75,7 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	if bundleDir == "" {
 		bundleDir = getwdOrDie()
 	}
-	spec, err := specutils.ReadSpec(bundleDir)
+	spec, err := specutils.ReadSpec(bundleDir, conf)
 	if err != nil {
 		return Errorf("reading spec: %v", err)
 	}
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
index 0205fd9f7..139edbd49 100644
--- a/runsc/cmd/start.go
+++ b/runsc/cmd/start.go
@@ -18,9 +18,10 @@ import (
 	"context"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/specutils"
 )
 
 // Start implements subcommands.Command for the "start" command.
@@ -52,12 +53,18 @@ func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
+	// Read the spec again here to ensure flag annotations from the spec are
+	// applied to "conf".
+	if _, err := specutils.ReadSpec(c.BundleDir, conf); err != nil {
+		Fatalf("reading spec: %v", err)
+	}
+
 	if err := c.Start(conf); err != nil {
 		Fatalf("starting container: %v", err)
 	}
diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go
index cf2413deb..2bd2ab9f8 100644
--- a/runsc/cmd/state.go
+++ b/runsc/cmd/state.go
@@ -21,7 +21,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -55,7 +55,7 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
index 29c0a15f0..28d0642ed 100644
--- a/runsc/cmd/wait.go
+++ b/runsc/cmd/wait.go
@@ -21,7 +21,7 @@ import (
 	"syscall"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -70,7 +70,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/config/BUILD b/runsc/config/BUILD
new file mode 100644
index 000000000..b1672bb9d
--- /dev/null
+++ b/runsc/config/BUILD
@@ -0,0 +1,28 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "config",
+    srcs = [
+        "config.go",
+        "flags.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/refs",
+        "//pkg/sentry/watchdog",
+        "//pkg/sync",
+        "//runsc/flag",
+    ],
+)
+
+go_test(
+    name = "config_test",
+    size = "small",
+    srcs = [
+        "config_test.go",
+    ],
+    library = ":config",
+    deps = ["//runsc/flag"],
+)
diff --git a/runsc/boot/config.go b/runsc/config/config.go
index 80da8b3e6..f30f79f68 100644
--- a/runsc/boot/config.go
+++ b/runsc/config/config.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,220 +12,112 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package boot
+// Package config provides basic infrastructure to set configuration settings
+// for runsc. The configuration is set by flags to the command line. They can
+// also propagate to a different process using the same flags.
+package config
 
 import (
 	"fmt"
-	"strconv"
-	"strings"
 
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 )
 
-// FileAccessType tells how the filesystem is accessed.
-type FileAccessType int
-
-const (
-	// FileAccessShared sends IO requests to a Gofer process that validates the
-	// requests and forwards them to the host.
-	FileAccessShared FileAccessType = iota
-
-	// FileAccessExclusive is the same as FileAccessShared, but enables
-	// extra caching for improved performance. It should only be used if
-	// the sandbox has exclusive access to the filesystem.
-	FileAccessExclusive
-)
-
-// MakeFileAccessType converts type from string.
-func MakeFileAccessType(s string) (FileAccessType, error) {
-	switch s {
-	case "shared":
-		return FileAccessShared, nil
-	case "exclusive":
-		return FileAccessExclusive, nil
-	default:
-		return 0, fmt.Errorf("invalid file access type %q", s)
-	}
-}
-
-func (f FileAccessType) String() string {
-	switch f {
-	case FileAccessShared:
-		return "shared"
-	case FileAccessExclusive:
-		return "exclusive"
-	default:
-		return fmt.Sprintf("unknown(%d)", f)
-	}
-}
-
-// NetworkType tells which network stack to use.
-type NetworkType int
-
-const (
-	// NetworkSandbox uses internal network stack, isolated from the host.
-	NetworkSandbox NetworkType = iota
-
-	// NetworkHost redirects network related syscalls to the host network.
-	NetworkHost
-
-	// NetworkNone sets up just loopback using netstack.
-	NetworkNone
-)
-
-// MakeNetworkType converts type from string.
-func MakeNetworkType(s string) (NetworkType, error) {
-	switch s {
-	case "sandbox":
-		return NetworkSandbox, nil
-	case "host":
-		return NetworkHost, nil
-	case "none":
-		return NetworkNone, nil
-	default:
-		return 0, fmt.Errorf("invalid network type %q", s)
-	}
-}
-
-func (n NetworkType) String() string {
-	switch n {
-	case NetworkSandbox:
-		return "sandbox"
-	case NetworkHost:
-		return "host"
-	case NetworkNone:
-		return "none"
-	default:
-		return fmt.Sprintf("unknown(%d)", n)
-	}
-}
-
-// MakeWatchdogAction converts type from string.
-func MakeWatchdogAction(s string) (watchdog.Action, error) {
-	switch strings.ToLower(s) {
-	case "log", "logwarning":
-		return watchdog.LogWarning, nil
-	case "panic":
-		return watchdog.Panic, nil
-	default:
-		return 0, fmt.Errorf("invalid watchdog action %q", s)
-	}
-}
-
-// MakeRefsLeakMode converts type from string.
-func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
-	switch strings.ToLower(s) {
-	case "disabled":
-		return refs.NoLeakChecking, nil
-	case "log-names":
-		return refs.LeaksLogWarning, nil
-	case "log-traces":
-		return refs.LeaksLogTraces, nil
-	default:
-		return 0, fmt.Errorf("invalid refs leakmode %q", s)
-	}
-}
-
-func refsLeakModeToString(mode refs.LeakMode) string {
-	switch mode {
-	// If not set, default it to disabled.
-	case refs.UninitializedLeakChecking, refs.NoLeakChecking:
-		return "disabled"
-	case refs.LeaksLogWarning:
-		return "log-names"
-	case refs.LeaksLogTraces:
-		return "log-traces"
-	default:
-		panic(fmt.Sprintf("Invalid leakmode: %d", mode))
-	}
-}
-
 // Config holds configuration that is not part of the runtime spec.
+//
+// Follow these steps to add a new flag:
+//   1. Create a new field in Config.
+//   2. Add a field tag with the flag name
+//   3. Register a new flag in flags.go, with name and description
+//   4. Add any necessary validation into validate()
+//   5. If adding an enum, follow the same pattern as FileAccessType
+//
 type Config struct {
 	// RootDir is the runtime root directory.
-	RootDir string
+	RootDir string `flag:"root"`
 
 	// Debug indicates that debug logging should be enabled.
-	Debug bool
+	Debug bool `flag:"debug"`
 
 	// LogFilename is the filename to log to, if not empty.
-	LogFilename string
+	LogFilename string `flag:"log"`
 
 	// LogFormat is the log format.
-	LogFormat string
+	LogFormat string `flag:"log-format"`
 
 	// DebugLog is the path to log debug information to, if not empty.
-	DebugLog string
+	DebugLog string `flag:"debug-log"`
 
 	// PanicLog is the path to log GO's runtime messages, if not empty.
-	PanicLog string
+	PanicLog string `flag:"panic-log"`
 
 	// DebugLogFormat is the log format for debug.
-	DebugLogFormat string
+	DebugLogFormat string `flag:"debug-log-format"`
 
 	// FileAccess indicates how the filesystem is accessed.
-	FileAccess FileAccessType
+	FileAccess FileAccessType `flag:"file-access"`
 
 	// Overlay is whether to wrap the root filesystem in an overlay.
-	Overlay bool
+	Overlay bool `flag:"overlay"`
 
 	// FSGoferHostUDS enables the gofer to mount a host UDS.
-	FSGoferHostUDS bool
+	FSGoferHostUDS bool `flag:"fsgofer-host-uds"`
 
 	// Network indicates what type of network to use.
-	Network NetworkType
+	Network NetworkType `flag:"network"`
 
 	// EnableRaw indicates whether raw sockets should be enabled. Raw
 	// sockets are disabled by stripping CAP_NET_RAW from the list of
 	// capabilities.
-	EnableRaw bool
+	EnableRaw bool `flag:"net-raw"`
 
 	// HardwareGSO indicates that hardware segmentation offload is enabled.
-	HardwareGSO bool
+	HardwareGSO bool `flag:"gso"`
 
 	// SoftwareGSO indicates that software segmentation offload is enabled.
-	SoftwareGSO bool
+	SoftwareGSO bool `flag:"software-gso"`
 
 	// TXChecksumOffload indicates that TX Checksum Offload is enabled.
-	TXChecksumOffload bool
+	TXChecksumOffload bool `flag:"tx-checksum-offload"`
 
 	// RXChecksumOffload indicates that RX Checksum Offload is enabled.
-	RXChecksumOffload bool
+	RXChecksumOffload bool `flag:"rx-checksum-offload"`
 
 	// QDisc indicates the type of queuening discipline to use by default
 	// for non-loopback interfaces.
-	QDisc QueueingDiscipline
+	QDisc QueueingDiscipline `flag:"qdisc"`
 
 	// LogPackets indicates that all network packets should be logged.
-	LogPackets bool
+	LogPackets bool `flag:"log-packets"`
 
 	// Platform is the platform to run on.
-	Platform string
+	Platform string `flag:"platform"`
 
 	// Strace indicates that strace should be enabled.
-	Strace bool
+	Strace bool `flag:"strace"`
 
-	// StraceSyscalls is the set of syscalls to trace.  If StraceEnable is
-	// true and this list is empty, then all syscalls will be traced.
-	StraceSyscalls []string
+	// StraceSyscalls is the set of syscalls to trace (comma-separated values).
+	// If StraceEnable is true and this string is empty, then all syscalls will
+	// be traced.
+	StraceSyscalls string `flag:"strace-syscalls"`
 
 	// StraceLogSize is the max size of data blobs to display.
-	StraceLogSize uint
+	StraceLogSize uint `flag:"strace-log-size"`
 
 	// DisableSeccomp indicates whether seccomp syscall filters should be
 	// disabled. Pardon the double negation, but default to enabled is important.
 	DisableSeccomp bool
 
 	// WatchdogAction sets what action the watchdog takes when triggered.
-	WatchdogAction watchdog.Action
+	WatchdogAction watchdog.Action `flag:"watchdog-action"`
 
 	// PanicSignal registers signal handling that panics. Usually set to
 	// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
-	PanicSignal int
+	PanicSignal int `flag:"panic-signal"`
 
 	// ProfileEnable is set to prepare the sandbox to be profiled.
-	ProfileEnable bool
+	ProfileEnable bool `flag:"profile"`
 
 	// RestoreFile is the path to the saved container image
 	RestoreFile string
@@ -233,104 +125,215 @@ type Config struct {
 	// NumNetworkChannels controls the number of AF_PACKET sockets that map
 	// to the same underlying network device. This allows netstack to better
 	// scale for high throughput use cases.
-	NumNetworkChannels int
+	NumNetworkChannels int `flag:"num-network-channels"`
 
 	// Rootless allows the sandbox to be started with a user that is not root.
 	// Defense is depth measures are weaker with rootless. Specifically, the
 	// sandbox and Gofer process run as root inside a user namespace with root
 	// mapped to the caller's user.
-	Rootless bool
+	Rootless bool `flag:"rootless"`
 
 	// AlsoLogToStderr allows to send log messages to stderr.
-	AlsoLogToStderr bool
+	AlsoLogToStderr bool `flag:"alsologtostderr"`
 
 	// ReferenceLeakMode sets reference leak check mode
-	ReferenceLeakMode refs.LeakMode
+	ReferenceLeak refs.LeakMode `flag:"ref-leak-mode"`
 
 	// OverlayfsStaleRead instructs the sandbox to assume that the root mount
 	// is on a Linux overlayfs mount, which does not necessarily preserve
 	// coherence between read-only and subsequent writable file descriptors
 	// representing the "same" file.
-	OverlayfsStaleRead bool
+	OverlayfsStaleRead bool `flag:"overlayfs-stale-read"`
+
+	// CPUNumFromQuota sets CPU number count to available CPU quota, using
+	// least integer value greater than or equal to quota.
+	//
+	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
+	CPUNumFromQuota bool `flag:"cpu-num-from-quota"`
+
+	// Enables VFS2.
+	VFS2 bool `flag:"vfs2"`
+
+	// Enables FUSE usage.
+	FUSE bool `flag:"fuse"`
+
+	// Allows overriding of flags in OCI annotations.
+	AllowFlagOverride bool `flag:"allow-flag-override"`
+
+	// Enables seccomp inside the sandbox.
+	OCISeccomp bool `flag:"oci-seccomp"`
 
 	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
 	// tests. It allows runsc to start the sandbox process as the current
 	// user, and without chrooting the sandbox process. This can be
 	// necessary in test environments that have limited capabilities.
-	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+	TestOnlyAllowRunAsCurrentUserWithoutChroot bool `flag:"TESTONLY-unsafe-nonroot"`
 
 	// TestOnlyTestNameEnv should only be used in tests. It looks up for the
 	// test name in the container environment variables and adds it to the debug
 	// log file name. This is done to help identify the log with the test when
 	// multiple tests are run in parallel, since there is no way to pass
 	// parameters to the runtime from docker.
-	TestOnlyTestNameEnv string
+	TestOnlyTestNameEnv string `flag:"TESTONLY-test-name-env"`
+}
 
-	// CPUNumFromQuota sets CPU number count to available CPU quota, using
-	// least integer value greater than or equal to quota.
-	//
-	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
-	CPUNumFromQuota bool
+func (c *Config) validate() error {
+	if c.FileAccess == FileAccessShared && c.Overlay {
+		return fmt.Errorf("overlay flag is incompatible with shared file access")
+	}
+	if c.NumNetworkChannels <= 0 {
+		return fmt.Errorf("num_network_channels must be > 0, got: %d", c.NumNetworkChannels)
+	}
+	return nil
+}
+
+// FileAccessType tells how the filesystem is accessed.
+type FileAccessType int
+
+const (
+	// FileAccessExclusive is the same as FileAccessShared, but enables
+	// extra caching for improved performance. It should only be used if
+	// the sandbox has exclusive access to the filesystem.
+	FileAccessExclusive FileAccessType = iota
 
-	// Enables VFS2 (not plumbled through yet).
-	VFS2 bool
+	// FileAccessShared sends IO requests to a Gofer process that validates the
+	// requests and forwards them to the host.
+	FileAccessShared
+)
 
-	// Enables FUSE usage (not plumbled through yet).
-	FUSE bool
+func fileAccessTypePtr(v FileAccessType) *FileAccessType {
+	return &v
 }
 
-// ToFlags returns a slice of flags that correspond to the given Config.
-func (c *Config) ToFlags() []string {
-	f := []string{
-		"--root=" + c.RootDir,
-		"--debug=" + strconv.FormatBool(c.Debug),
-		"--log=" + c.LogFilename,
-		"--log-format=" + c.LogFormat,
-		"--debug-log=" + c.DebugLog,
-		"--panic-log=" + c.PanicLog,
-		"--debug-log-format=" + c.DebugLogFormat,
-		"--file-access=" + c.FileAccess.String(),
-		"--overlay=" + strconv.FormatBool(c.Overlay),
-		"--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS),
-		"--network=" + c.Network.String(),
-		"--log-packets=" + strconv.FormatBool(c.LogPackets),
-		"--platform=" + c.Platform,
-		"--strace=" + strconv.FormatBool(c.Strace),
-		"--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
-		"--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
-		"--watchdog-action=" + c.WatchdogAction.String(),
-		"--panic-signal=" + strconv.Itoa(c.PanicSignal),
-		"--profile=" + strconv.FormatBool(c.ProfileEnable),
-		"--net-raw=" + strconv.FormatBool(c.EnableRaw),
-		"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
-		"--rootless=" + strconv.FormatBool(c.Rootless),
-		"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
-		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
-		"--gso=" + strconv.FormatBool(c.HardwareGSO),
-		"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
-		"--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload),
-		"--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload),
-		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
-		"--qdisc=" + c.QDisc.String(),
+// Set implements flag.Value.
+func (f *FileAccessType) Set(v string) error {
+	switch v {
+	case "shared":
+		*f = FileAccessShared
+	case "exclusive":
+		*f = FileAccessExclusive
+	default:
+		return fmt.Errorf("invalid file access type %q", v)
 	}
-	if c.CPUNumFromQuota {
-		f = append(f, "--cpu-num-from-quota")
+	return nil
+}
+
+// Get implements flag.Value.
+func (f *FileAccessType) Get() interface{} {
+	return *f
+}
+
+// String implements flag.Value.
+func (f *FileAccessType) String() string {
+	switch *f {
+	case FileAccessShared:
+		return "shared"
+	case FileAccessExclusive:
+		return "exclusive"
 	}
-	// Only include these if set since it is never to be used by users.
-	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
-		f = append(f, "--TESTONLY-unsafe-nonroot=true")
+	panic(fmt.Sprintf("Invalid file access type %v", *f))
+}
+
+// NetworkType tells which network stack to use.
+type NetworkType int
+
+const (
+	// NetworkSandbox uses internal network stack, isolated from the host.
+	NetworkSandbox NetworkType = iota
+
+	// NetworkHost redirects network related syscalls to the host network.
+	NetworkHost
+
+	// NetworkNone sets up just loopback using netstack.
+	NetworkNone
+)
+
+func networkTypePtr(v NetworkType) *NetworkType {
+	return &v
+}
+
+// Set implements flag.Value.
+func (n *NetworkType) Set(v string) error {
+	switch v {
+	case "sandbox":
+		*n = NetworkSandbox
+	case "host":
+		*n = NetworkHost
+	case "none":
+		*n = NetworkNone
+	default:
+		return fmt.Errorf("invalid network type %q", v)
 	}
-	if len(c.TestOnlyTestNameEnv) != 0 {
-		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
+	return nil
+}
+
+// Get implements flag.Value.
+func (n *NetworkType) Get() interface{} {
+	return *n
+}
+
+// String implements flag.Value.
+func (n *NetworkType) String() string {
+	switch *n {
+	case NetworkSandbox:
+		return "sandbox"
+	case NetworkHost:
+		return "host"
+	case NetworkNone:
+		return "none"
 	}
+	panic(fmt.Sprintf("Invalid network type %v", *n))
+}
+
+// QueueingDiscipline is used to specify the kind of Queueing Discipline to
+// apply for a give FDBasedLink.
+type QueueingDiscipline int
+
+const (
+	// QDiscNone disables any queueing for the underlying FD.
+	QDiscNone QueueingDiscipline = iota
+
+	// QDiscFIFO applies a simple fifo based queue to the underlying FD.
+	QDiscFIFO
+)
+
+func queueingDisciplinePtr(v QueueingDiscipline) *QueueingDiscipline {
+	return &v
+}
 
-	if c.VFS2 {
-		f = append(f, "--vfs2=true")
+// Set implements flag.Value.
+func (q *QueueingDiscipline) Set(v string) error {
+	switch v {
+	case "none":
+		*q = QDiscNone
+	case "fifo":
+		*q = QDiscFIFO
+	default:
+		return fmt.Errorf("invalid qdisc %q", v)
 	}
+	return nil
+}
+
+// Get implements flag.Value.
+func (q *QueueingDiscipline) Get() interface{} {
+	return *q
+}
 
-	if c.FUSE {
-		f = append(f, "--fuse=true")
+// String implements flag.Value.
+func (q *QueueingDiscipline) String() string {
+	switch *q {
+	case QDiscNone:
+		return "none"
+	case QDiscFIFO:
+		return "fifo"
 	}
+	panic(fmt.Sprintf("Invalid qdisc %v", *q))
+}
+
+func leakModePtr(v refs.LeakMode) *refs.LeakMode {
+	return &v
+}
 
-	return f
+func watchdogActionPtr(v watchdog.Action) *watchdog.Action {
+	return &v
 }
diff --git a/runsc/config/config_test.go b/runsc/config/config_test.go
new file mode 100644
index 000000000..fb162b7eb
--- /dev/null
+++ b/runsc/config/config_test.go
@@ -0,0 +1,272 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package config
+
+import (
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+func init() {
+	RegisterFlags()
+}
+
+func TestDefault(t *testing.T) {
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	// "--root" is always set to something different than the default. Reset it
+	// to make it easier to test that default values do not generate flags.
+	c.RootDir = ""
+
+	// All defaults doesn't require setting flags.
+	flags := c.ToFlags()
+	if len(flags) > 0 {
+		t.Errorf("default flags not set correctly for: %s", flags)
+	}
+}
+
+func setDefault(name string) {
+	fl := flag.CommandLine.Lookup(name)
+	fl.Value.Set(fl.DefValue)
+}
+
+func TestFromFlags(t *testing.T) {
+	flag.CommandLine.Lookup("root").Value.Set("some-path")
+	flag.CommandLine.Lookup("debug").Value.Set("true")
+	flag.CommandLine.Lookup("num-network-channels").Value.Set("123")
+	flag.CommandLine.Lookup("network").Value.Set("none")
+	defer func() {
+		setDefault("root")
+		setDefault("debug")
+		setDefault("num-network-channels")
+		setDefault("network")
+	}()
+
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if want := "some-path"; c.RootDir != want {
+		t.Errorf("RootDir=%v, want: %v", c.RootDir, want)
+	}
+	if want := true; c.Debug != want {
+		t.Errorf("Debug=%v, want: %v", c.Debug, want)
+	}
+	if want := 123; c.NumNetworkChannels != want {
+		t.Errorf("NumNetworkChannels=%v, want: %v", c.NumNetworkChannels, want)
+	}
+	if want := NetworkNone; c.Network != want {
+		t.Errorf("Network=%v, want: %v", c.Network, want)
+	}
+}
+
+func TestToFlags(t *testing.T) {
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	c.RootDir = "some-path"
+	c.Debug = true
+	c.NumNetworkChannels = 123
+	c.Network = NetworkNone
+
+	flags := c.ToFlags()
+	if len(flags) != 4 {
+		t.Errorf("wrong number of flags set, want: 4, got: %d: %s", len(flags), flags)
+	}
+	t.Logf("Flags: %s", flags)
+	fm := map[string]string{}
+	for _, f := range flags {
+		kv := strings.Split(f, "=")
+		fm[kv[0]] = kv[1]
+	}
+	for name, want := range map[string]string{
+		"--root":                 "some-path",
+		"--debug":                "true",
+		"--num-network-channels": "123",
+		"--network":              "none",
+	} {
+		if got, ok := fm[name]; ok {
+			if got != want {
+				t.Errorf("flag %q, want: %q, got: %q", name, want, got)
+			}
+		} else {
+			t.Errorf("flag %q not set", name)
+		}
+	}
+}
+
+// TestInvalidFlags checks that enum flags fail when value is not in enum set.
+func TestInvalidFlags(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		error string
+	}{
+		{
+			name:  "file-access",
+			error: "invalid file access type",
+		},
+		{
+			name:  "network",
+			error: "invalid network type",
+		},
+		{
+			name:  "qdisc",
+			error: "invalid qdisc",
+		},
+		{
+			name:  "watchdog-action",
+			error: "invalid watchdog action",
+		},
+		{
+			name:  "ref-leak-mode",
+			error: "invalid ref leak mode",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			defer setDefault(tc.name)
+			if err := flag.CommandLine.Lookup(tc.name).Value.Set("invalid"); err == nil || !strings.Contains(err.Error(), tc.error) {
+				t.Errorf("flag.Value.Set(invalid) wrong error reported: %v", err)
+			}
+		})
+	}
+}
+
+func TestValidationFail(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		flags map[string]string
+		error string
+	}{
+		{
+			name: "shared+overlay",
+			flags: map[string]string{
+				"file-access": "shared",
+				"overlay":     "true",
+			},
+			error: "overlay flag is incompatible",
+		},
+		{
+			name: "network-channels",
+			flags: map[string]string{
+				"num-network-channels": "-1",
+			},
+			error: "num_network_channels must be > 0",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			for name, val := range tc.flags {
+				defer setDefault(name)
+				if err := flag.CommandLine.Lookup(name).Value.Set(val); err != nil {
+					t.Errorf("%s=%q: %v", name, val, err)
+				}
+			}
+			if _, err := NewFromFlags(); err == nil || !strings.Contains(err.Error(), tc.error) {
+				t.Errorf("NewFromFlags() wrong error reported: %v", err)
+			}
+		})
+	}
+}
+
+func TestOverride(t *testing.T) {
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	c.AllowFlagOverride = true
+
+	t.Run("string", func(t *testing.T) {
+		c.RootDir = "foobar"
+		if err := c.Override("root", "bar"); err != nil {
+			t.Fatalf("Override(root, bar) failed: %v", err)
+		}
+		defer setDefault("root")
+		if c.RootDir != "bar" {
+			t.Errorf("Override(root, bar) didn't work: %+v", c)
+		}
+	})
+
+	t.Run("bool", func(t *testing.T) {
+		c.Debug = true
+		if err := c.Override("debug", "false"); err != nil {
+			t.Fatalf("Override(debug, false) failed: %v", err)
+		}
+		defer setDefault("debug")
+		if c.Debug {
+			t.Errorf("Override(debug, false) didn't work: %+v", c)
+		}
+	})
+
+	t.Run("enum", func(t *testing.T) {
+		c.FileAccess = FileAccessShared
+		if err := c.Override("file-access", "exclusive"); err != nil {
+			t.Fatalf("Override(file-access, exclusive) failed: %v", err)
+		}
+		defer setDefault("file-access")
+		if c.FileAccess != FileAccessExclusive {
+			t.Errorf("Override(file-access, exclusive) didn't work: %+v", c)
+		}
+	})
+}
+
+func TestOverrideDisabled(t *testing.T) {
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	const errMsg = "flag override disabled"
+	if err := c.Override("root", "path"); err == nil || !strings.Contains(err.Error(), errMsg) {
+		t.Errorf("Override() wrong error: %v", err)
+	}
+}
+
+func TestOverrideError(t *testing.T) {
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	c.AllowFlagOverride = true
+	for _, tc := range []struct {
+		name  string
+		value string
+		error string
+	}{
+		{
+			name:  "invalid",
+			value: "valid",
+			error: `flag "invalid" not found`,
+		},
+		{
+			name:  "debug",
+			value: "invalid",
+			error: "error setting flag debug",
+		},
+		{
+			name:  "file-access",
+			value: "invalid",
+			error: "invalid file access type",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			if err := c.Override(tc.name, tc.value); err == nil || !strings.Contains(err.Error(), tc.error) {
+				t.Errorf("Override(%q, %q) wrong error: %v", tc.name, tc.value, err)
+			}
+		})
+	}
+}
diff --git a/runsc/config/flags.go b/runsc/config/flags.go
new file mode 100644
index 000000000..a5f25cfa2
--- /dev/null
+++ b/runsc/config/flags.go
@@ -0,0 +1,205 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package config
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"reflect"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+var registration sync.Once
+
+// This is the set of flags used to populate Config.
+func RegisterFlags() {
+	registration.Do(func() {
+		// Although these flags are not part of the OCI spec, they are used by
+		// Docker, and thus should not be changed.
+		flag.String("root", "", "root directory for storage of container state.")
+		flag.String("log", "", "file path where internal debug information is written, default is stdout.")
+		flag.String("log-format", "text", "log format: text (default), json, or json-k8s.")
+		flag.Bool("debug", false, "enable debug logging.")
+
+		// These flags are unique to runsc, and are used to configure parts of the
+		// system that are not covered by the runtime spec.
+
+		// Debugging flags.
+		flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
+		flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.")
+		flag.Bool("log-packets", false, "enable network packet logging.")
+		flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
+		flag.Bool("alsologtostderr", false, "send log messages to stderr.")
+		flag.Bool("allow-flag-override", false, "allow OCI annotations (dev.gvisor.flag.<name>) to override flags for debugging.")
+
+		// Debugging flags: strace related
+		flag.Bool("strace", false, "enable strace.")
+		flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
+		flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.")
+
+		// Flags that control sandbox runtime behavior.
+		flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
+		flag.Var(watchdogActionPtr(watchdog.LogWarning), "watchdog-action", "sets what action the watchdog takes when triggered: log (default), panic.")
+		flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
+		flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
+		flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
+		flag.Var(leakModePtr(refs.NoLeakChecking), "ref-leak-mode", "sets reference leak check mode: disabled (default), log-names, log-traces.")
+		flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
+		flag.Bool("oci-seccomp", false, "Enables loading OCI seccomp filters inside the sandbox.")
+
+		// Flags that control sandbox runtime behavior: FS related.
+		flag.Var(fileAccessTypePtr(FileAccessExclusive), "file-access", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
+		flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+		flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem")
+		flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
+		flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
+		flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.")
+
+		// Flags that control sandbox runtime behavior: network related.
+		flag.Var(networkTypePtr(NetworkSandbox), "network", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
+		flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
+		flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
+		flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.")
+		flag.Bool("tx-checksum-offload", false, "enable TX checksum offload.")
+		flag.Bool("rx-checksum-offload", true, "enable RX checksum offload.")
+		flag.Var(queueingDisciplinePtr(QDiscFIFO), "qdisc", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.")
+		flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
+
+		// Test flags, not to be used outside tests, ever.
+		flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
+		flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.")
+	})
+}
+
+// NewFromFlags creates a new Config with values coming from command line flags.
+func NewFromFlags() (*Config, error) {
+	conf := &Config{}
+
+	obj := reflect.ValueOf(conf).Elem()
+	st := obj.Type()
+	for i := 0; i < st.NumField(); i++ {
+		f := st.Field(i)
+		name, ok := f.Tag.Lookup("flag")
+		if !ok {
+			// No flag set for this field.
+			continue
+		}
+		fl := flag.CommandLine.Lookup(name)
+		if fl == nil {
+			panic(fmt.Sprintf("Flag %q not found", name))
+		}
+		x := reflect.ValueOf(flag.Get(fl.Value))
+		obj.Field(i).Set(x)
+	}
+
+	if len(conf.RootDir) == 0 {
+		// If not set, set default root dir to something (hopefully) user-writeable.
+		conf.RootDir = "/var/run/runsc"
+		if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" {
+			conf.RootDir = filepath.Join(runtimeDir, "runsc")
+		}
+	}
+
+	if err := conf.validate(); err != nil {
+		return nil, err
+	}
+	return conf, nil
+}
+
+// ToFlags returns a slice of flags that correspond to the given Config.
+func (c *Config) ToFlags() []string {
+	var rv []string
+
+	obj := reflect.ValueOf(c).Elem()
+	st := obj.Type()
+	for i := 0; i < st.NumField(); i++ {
+		f := st.Field(i)
+		name, ok := f.Tag.Lookup("flag")
+		if !ok {
+			// No flag set for this field.
+			continue
+		}
+		val := getVal(obj.Field(i))
+
+		flag := flag.CommandLine.Lookup(name)
+		if flag == nil {
+			panic(fmt.Sprintf("Flag %q not found", name))
+		}
+		if val == flag.DefValue {
+			continue
+		}
+		rv = append(rv, fmt.Sprintf("--%s=%s", flag.Name, val))
+	}
+	return rv
+}
+
+// Override writes a new value to a flag.
+func (c *Config) Override(name string, value string) error {
+	if !c.AllowFlagOverride {
+		return fmt.Errorf("flag override disabled, use --allow-flag-override to enable it")
+	}
+
+	obj := reflect.ValueOf(c).Elem()
+	st := obj.Type()
+	for i := 0; i < st.NumField(); i++ {
+		f := st.Field(i)
+		fieldName, ok := f.Tag.Lookup("flag")
+		if !ok || fieldName != name {
+			// Not a flag field, or flag name doesn't match.
+			continue
+		}
+		fl := flag.CommandLine.Lookup(name)
+		if fl == nil {
+			// Flag must exist if there is a field match above.
+			panic(fmt.Sprintf("Flag %q not found", name))
+		}
+
+		// Use flag to convert the string value to the underlying flag type, using
+		// the same rules as the command-line for consistency.
+		if err := fl.Value.Set(value); err != nil {
+			return fmt.Errorf("error setting flag %s=%q: %w", name, value, err)
+		}
+		x := reflect.ValueOf(flag.Get(fl.Value))
+		obj.Field(i).Set(x)
+
+		// Validates the config again to ensure it's left in a consistent state.
+		return c.validate()
+	}
+	return fmt.Errorf("flag %q not found. Cannot set it to %q", name, value)
+}
+
+func getVal(field reflect.Value) string {
+	if str, ok := field.Addr().Interface().(fmt.Stringer); ok {
+		return str.String()
+	}
+	switch field.Kind() {
+	case reflect.Bool:
+		return strconv.FormatBool(field.Bool())
+	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
+		return strconv.FormatInt(field.Int(), 10)
+	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:
+		return strconv.FormatUint(field.Uint(), 10)
+	case reflect.String:
+		return field.String()
+	default:
+		panic("unknown type " + field.Kind().String())
+	}
+}
diff --git a/runsc/console/console.go b/runsc/console/console.go
index 64b23639a..dbb88e117 100644
--- a/runsc/console/console.go
+++ b/runsc/console/console.go
@@ -24,11 +24,11 @@ import (
 	"golang.org/x/sys/unix"
 )
 
-// NewWithSocket creates pty master/slave pair, sends the master FD over the given
-// socket, and returns the slave.
+// NewWithSocket creates pty master/replica pair, sends the master FD over the given
+// socket, and returns the replica.
 func NewWithSocket(socketPath string) (*os.File, error) {
-	// Create a new pty master and slave.
-	ptyMaster, ptySlave, err := pty.Open()
+	// Create a new pty master and replica.
+	ptyMaster, ptyReplica, err := pty.Open()
 	if err != nil {
 		return nil, fmt.Errorf("opening pty: %v", err)
 	}
@@ -37,18 +37,18 @@ func NewWithSocket(socketPath string) (*os.File, error) {
 	// Get a connection to the socket path.
 	conn, err := net.Dial("unix", socketPath)
 	if err != nil {
-		ptySlave.Close()
+		ptyReplica.Close()
 		return nil, fmt.Errorf("dialing socket %q: %v", socketPath, err)
 	}
 	defer conn.Close()
 	uc, ok := conn.(*net.UnixConn)
 	if !ok {
-		ptySlave.Close()
+		ptyReplica.Close()
 		return nil, fmt.Errorf("connection is not a UnixConn: %T", conn)
 	}
 	socket, err := uc.File()
 	if err != nil {
-		ptySlave.Close()
+		ptyReplica.Close()
 		return nil, fmt.Errorf("getting file for unix socket %v: %v", uc, err)
 	}
 	defer socket.Close()
@@ -56,8 +56,8 @@ func NewWithSocket(socketPath string) (*os.File, error) {
 	// Send the master FD over the connection.
 	msg := unix.UnixRights(int(ptyMaster.Fd()))
 	if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil {
-		ptySlave.Close()
+		ptyReplica.Close()
 		return nil, fmt.Errorf("sending console over unix socket %q: %v", socketPath, err)
 	}
-	return ptySlave, nil
+	return ptyReplica, nil
 }
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 9a9ee7e2a..c33755482 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -23,6 +23,7 @@ go_library(
         "//pkg/sync",
         "//runsc/boot",
         "//runsc/cgroup",
+        "//runsc/config",
         "//runsc/sandbox",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
@@ -65,6 +66,7 @@ go_test(
         "//pkg/urpc",
         "//runsc/boot",
         "//runsc/boot/platforms",
+        "//runsc/config",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_kr_pty//:go_default_library",
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 995d4e267..4228399b8 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -185,14 +185,14 @@ func TestJobControlSignalExec(t *testing.T) {
 		t.Fatalf("error starting container: %v", err)
 	}
 
-	// Create a pty master/slave. The slave will be passed to the exec
+	// Create a pty master/replica. The replica will be passed to the exec
 	// process.
-	ptyMaster, ptySlave, err := pty.Open()
+	ptyMaster, ptyReplica, err := pty.Open()
 	if err != nil {
 		t.Fatalf("error opening pty: %v", err)
 	}
 	defer ptyMaster.Close()
-	defer ptySlave.Close()
+	defer ptyReplica.Close()
 
 	// Exec bash and attach a terminal. Note that occasionally /bin/sh
 	// may be a different shell or have a different configuration (such
@@ -203,9 +203,9 @@ func TestJobControlSignalExec(t *testing.T) {
 		// Don't let bash execute from profile or rc files, otherwise
 		// our PID counts get messed up.
 		Argv: []string{"/bin/bash", "--noprofile", "--norc"},
-		// Pass the pty slave as FD 0, 1, and 2.
+		// Pass the pty replica as FD 0, 1, and 2.
 		FilePayload: urpc.FilePayload{
-			Files: []*os.File{ptySlave, ptySlave, ptySlave},
+			Files: []*os.File{ptyReplica, ptyReplica, ptyReplica},
 		},
 		StdioIsPty: true,
 	}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 7ad09bf23..52e1755ce 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/sighandling"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/cgroup"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/sandbox"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -269,7 +270,7 @@ type Args struct {
 // New creates the container in a new Sandbox process, unless the metadata
 // indicates that an existing Sandbox should be used. The caller must call
 // Destroy() on the container.
-func New(conf *boot.Config, args Args) (*Container, error) {
+func New(conf *config.Config, args Args) (*Container, error) {
 	log.Debugf("Create container %q in root dir: %s", args.ID, conf.RootDir)
 	if err := validateID(args.ID); err != nil {
 		return nil, err
@@ -311,6 +312,14 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 	if isRoot(args.Spec) {
 		log.Debugf("Creating new sandbox for container %q", args.ID)
 
+		if args.Spec.Linux == nil {
+			args.Spec.Linux = &specs.Linux{}
+		}
+		// Don't force the use of cgroups in tests because they lack permission to do so.
+		if args.Spec.Linux.CgroupsPath == "" && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+			args.Spec.Linux.CgroupsPath = "/" + args.ID
+		}
+
 		// Create and join cgroup before processes are created to ensure they are
 		// part of the cgroup from the start (and all their children processes).
 		cg, err := cgroup.New(args.Spec)
@@ -320,7 +329,13 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 		if cg != nil {
 			// If there is cgroup config, install it before creating sandbox process.
 			if err := cg.Install(args.Spec.Linux.Resources); err != nil {
-				return nil, fmt.Errorf("configuring cgroup: %v", err)
+				switch {
+				case errors.Is(err, syscall.EACCES) && conf.Rootless:
+					log.Warningf("Skipping cgroup configuration in rootless mode: %v", err)
+					cg = nil
+				default:
+					return nil, fmt.Errorf("configuring cgroup: %v", err)
+				}
 			}
 		}
 		if err := runInCgroup(cg, func() error {
@@ -397,7 +412,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 }
 
 // Start starts running the containerized process inside the sandbox.
-func (c *Container) Start(conf *boot.Config) error {
+func (c *Container) Start(conf *config.Config) error {
 	log.Debugf("Start container %q", c.ID)
 
 	if err := c.Saver.lock(); err != nil {
@@ -472,7 +487,7 @@ func (c *Container) Start(conf *boot.Config) error {
 
 // Restore takes a container and replaces its kernel and file system
 // to restore a container from its state file.
-func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error {
+func (c *Container) Restore(spec *specs.Spec, conf *config.Config, restoreFile string) error {
 	log.Debugf("Restore container %q", c.ID)
 	if err := c.Saver.lock(); err != nil {
 		return err
@@ -499,7 +514,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str
 }
 
 // Run is a helper that calls Create + Start + Wait.
-func Run(conf *boot.Config, args Args) (syscall.WaitStatus, error) {
+func Run(conf *config.Config, args Args) (syscall.WaitStatus, error) {
 	log.Debugf("Run container %q in root dir: %s", args.ID, conf.RootDir)
 	c, err := New(conf, args)
 	if err != nil {
@@ -861,7 +876,7 @@ func (c *Container) waitForStopped() error {
 	return backoff.Retry(op, b)
 }
 
-func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) {
+func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) {
 	// Start with the general config flags.
 	args := conf.ToFlags()
 
@@ -901,9 +916,6 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund
 	}
 
 	args = append(args, "gofer", "--bundle", bundleDir)
-	if conf.Overlay {
-		args = append(args, "--panic-on-write=true")
-	}
 
 	// Open the spec file to donate to the sandbox.
 	specFile, err := specutils.OpenSpec(bundleDir)
@@ -987,7 +999,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund
 	// Start the gofer in the given namespace.
 	log.Debugf("Starting gofer: %s %v", binPath, args)
 	if err := specutils.StartInNS(cmd, nss); err != nil {
-		return nil, nil, fmt.Errorf("Gofer: %v", err)
+		return nil, nil, fmt.Errorf("gofer: %v", err)
 	}
 	log.Infof("Gofer started, PID: %d", cmd.Process.Pid)
 	c.GoferPid = cmd.Process.Pid
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 5e8247bc8..cc188f45b 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -41,8 +41,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/test/testutil"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -250,7 +251,7 @@ func readOutputNum(file string, position int) (int, error) {
 
 // run starts the sandbox and waits for it to exit, checking that the
 // application succeeded.
-func run(spec *specs.Spec, conf *boot.Config) error {
+func run(spec *specs.Spec, conf *config.Config) error {
 	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		return fmt.Errorf("error setting up container: %v", err)
@@ -289,26 +290,24 @@ var (
 )
 
 // configs generates different configurations to run tests.
-func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
+func configs(t *testing.T, opts ...configOption) map[string]*config.Config {
 	// Always load the default config.
-	cs := make(map[string]*boot.Config)
+	cs := make(map[string]*config.Config)
+	testutil.TestConfig(t)
 	for _, o := range opts {
+		c := testutil.TestConfig(t)
 		switch o {
 		case overlay:
-			c := testutil.TestConfig(t)
 			c.Overlay = true
 			cs["overlay"] = c
 		case ptrace:
-			c := testutil.TestConfig(t)
 			c.Platform = platforms.Ptrace
 			cs["ptrace"] = c
 		case kvm:
-			c := testutil.TestConfig(t)
 			c.Platform = platforms.KVM
 			cs["kvm"] = c
 		case nonExclusiveFS:
-			c := testutil.TestConfig(t)
-			c.FileAccess = boot.FileAccessShared
+			c.FileAccess = config.FileAccessShared
 			cs["non-exclusive"] = c
 		default:
 			panic(fmt.Sprintf("unknown config option %v", o))
@@ -317,23 +316,14 @@ func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
 	return cs
 }
 
-func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*boot.Config {
-	vfs1 := configs(t, opts...)
-
-	var optsVFS2 []configOption
-	for _, opt := range opts {
-		// TODO(gvisor.dev/issue/1487): Enable overlay tests.
-		if opt != overlay {
-			optsVFS2 = append(optsVFS2, opt)
-		}
-	}
-
-	for key, value := range configs(t, optsVFS2...) {
+// TODO(gvisor.dev/issue/1624): Merge with configs when VFS2 is the default.
+func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*config.Config {
+	all := configs(t, opts...)
+	for key, value := range configs(t, opts...) {
 		value.VFS2 = true
-		vfs1[key+"VFS2"] = value
+		all[key+"VFS2"] = value
 	}
-
-	return vfs1
+	return all
 }
 
 // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle.
@@ -512,7 +502,7 @@ func TestExePath(t *testing.T) {
 		t.Fatalf("error making directory: %v", err)
 	}
 
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			for _, test := range []struct {
 				path    string
@@ -837,7 +827,7 @@ func TestExecProcList(t *testing.T) {
 
 // TestKillPid verifies that we can signal individual exec'd processes.
 func TestKillPid(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			app, err := testutil.FindFile("test/cmd/test_app/test_app")
 			if err != nil {
@@ -905,13 +895,15 @@ func TestKillPid(t *testing.T) {
 	}
 }
 
-// TestCheckpointRestore creates a container that continuously writes successive integers
-// to a file. To test checkpoint and restore functionality, the container is
-// checkpointed and the last number printed to the file is recorded. Then, it is restored in two
-// new containers and the first number printed from these containers is checked. Both should
-// be the next consecutive number after the last number from the checkpointed container.
+// TestCheckpointRestore creates a container that continuously writes successive
+// integers to a file. To test checkpoint and restore functionality, the
+// container is checkpointed and the last number printed to the file is
+// recorded. Then, it is restored in two new containers and the first number
+// printed from these containers is checked. Both should be the next consecutive
+// number after the last number from the checkpointed container.
 func TestCheckpointRestore(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
+	// TODO(gvisor.dev/issue/1663): Add VFS when S/R support is added.
 	for name, conf := range configs(t, noOverlay...) {
 		t.Run(name, func(t *testing.T) {
 			dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test")
@@ -1073,6 +1065,7 @@ func TestCheckpointRestore(t *testing.T) {
 // with filesystem Unix Domain Socket use.
 func TestUnixDomainSockets(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
+	// TODO(gvisor.dev/issue/1663): Add VFS when S/R support is added.
 	for name, conf := range configs(t, noOverlay...) {
 		t.Run(name, func(t *testing.T) {
 			// UDS path is limited to 108 chars for compatibility with older systems.
@@ -1210,7 +1203,7 @@ func TestUnixDomainSockets(t *testing.T) {
 // recreated. Then it resumes the container, verify that the file gets created
 // again.
 func TestPauseResume(t *testing.T) {
-	for name, conf := range configs(t, noOverlay...) {
+	for name, conf := range configsWithVFS2(t, noOverlay...) {
 		t.Run(name, func(t *testing.T) {
 			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "lock")
 			if err != nil {
@@ -1470,7 +1463,7 @@ func TestRunNonRoot(t *testing.T) {
 // TestMountNewDir checks that runsc will create destination directory if it
 // doesn't exit.
 func TestMountNewDir(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			root, err := ioutil.TempDir(testutil.TmpDir(), "root")
 			if err != nil {
@@ -1490,6 +1483,8 @@ func TestMountNewDir(t *testing.T) {
 				Source:      srcDir,
 				Type:        "bind",
 			})
+			// Extra points for creating the mount with a readonly root.
+			spec.Root.Readonly = true
 
 			if err := run(spec, conf); err != nil {
 				t.Fatalf("error running sandbox: %v", err)
@@ -1499,17 +1494,17 @@ func TestMountNewDir(t *testing.T) {
 }
 
 func TestReadonlyRoot(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
-			spec := testutil.NewSpecWithArgs("/bin/touch", "/foo")
+			spec := testutil.NewSpecWithArgs("sleep", "100")
 			spec.Root.Readonly = true
+
 			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 			if err != nil {
 				t.Fatalf("error setting up container: %v", err)
 			}
 			defer cleanup()
 
-			// Create, start and wait for the container.
 			args := Args{
 				ID:        testutil.RandomContainerID(),
 				Spec:      spec,
@@ -1524,12 +1519,82 @@ func TestReadonlyRoot(t *testing.T) {
 				t.Fatalf("error starting container: %v", err)
 			}
 
-			ws, err := c.Wait()
+			// Read mounts to check that root is readonly.
+			out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", "mount | grep ' / '")
+			if err != nil || ws != 0 {
+				t.Fatalf("exec failed, ws: %v, err: %v", ws, err)
+			}
+			t.Logf("root mount: %q", out)
+			if !strings.Contains(string(out), "(ro)") {
+				t.Errorf("root not mounted readonly: %q", out)
+			}
+
+			// Check that file cannot be created.
+			ws, err = execute(c, "/bin/touch", "/foo")
 			if err != nil {
-				t.Fatalf("error waiting on container: %v", err)
+				t.Fatalf("touch file in ro mount: %v", err)
 			}
 			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
-				t.Fatalf("container failed, waitStatus: %v", ws)
+				t.Fatalf("wrong waitStatus: %v", ws)
+			}
+		})
+	}
+}
+
+func TestReadonlyMount(t *testing.T) {
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			spec := testutil.NewSpecWithArgs("sleep", "100")
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: dir,
+				Source:      dir,
+				Type:        "bind",
+				Options:     []string{"ro"},
+			})
+			spec.Root.Readonly = false
+
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Read mounts to check that volume is readonly.
+			cmd := fmt.Sprintf("mount | grep ' %s '", dir)
+			out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", cmd)
+			if err != nil || ws != 0 {
+				t.Fatalf("exec failed, ws: %v, err: %v", ws, err)
+			}
+			t.Logf("mount: %q", out)
+			if !strings.Contains(string(out), "(ro)") {
+				t.Errorf("volume not mounted readonly: %q", out)
+			}
+
+			// Check that file cannot be created.
+			ws, err = execute(c, "/bin/touch", path.Join(dir, "file"))
+			if err != nil {
+				t.Fatalf("touch file in ro mount: %v", err)
+			}
+			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
+				t.Fatalf("wrong WaitStatus: %v", ws)
 			}
 		})
 	}
@@ -1616,54 +1681,6 @@ func TestUIDMap(t *testing.T) {
 	}
 }
 
-func TestReadonlyMount(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
-		t.Run(name, func(t *testing.T) {
-			dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
-			spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
-			if err != nil {
-				t.Fatalf("ioutil.TempDir() failed: %v", err)
-			}
-			spec.Mounts = append(spec.Mounts, specs.Mount{
-				Destination: dir,
-				Source:      dir,
-				Type:        "bind",
-				Options:     []string{"ro"},
-			})
-			spec.Root.Readonly = false
-
-			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
-			if err != nil {
-				t.Fatalf("error setting up container: %v", err)
-			}
-			defer cleanup()
-
-			// Create, start and wait for the container.
-			args := Args{
-				ID:        testutil.RandomContainerID(),
-				Spec:      spec,
-				BundleDir: bundleDir,
-			}
-			c, err := New(conf, args)
-			if err != nil {
-				t.Fatalf("error creating container: %v", err)
-			}
-			defer c.Destroy()
-			if err := c.Start(conf); err != nil {
-				t.Fatalf("error starting container: %v", err)
-			}
-
-			ws, err := c.Wait()
-			if err != nil {
-				t.Fatalf("error waiting on container: %v", err)
-			}
-			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
-				t.Fatalf("container failed, waitStatus: %v", ws)
-			}
-		})
-	}
-}
-
 // TestAbbreviatedIDs checks that runsc supports using abbreviated container
 // IDs in place of full IDs.
 func TestAbbreviatedIDs(t *testing.T) {
@@ -1830,8 +1847,9 @@ func TestUserLog(t *testing.T) {
 		t.Fatal("error finding test_app:", err)
 	}
 
-	// sched_rr_get_interval = 148 - not implemented in gvisor.
-	spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall=148")
+	// sched_rr_get_interval - not implemented in gvisor.
+	num := strconv.Itoa(syscall.SYS_SCHED_RR_GET_INTERVAL)
+	spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall="+num)
 	conf := testutil.TestConfig(t)
 	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
@@ -2013,7 +2031,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) {
 }
 
 func TestCreateWorkingDir(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create")
 			if err != nil {
@@ -2116,27 +2134,19 @@ func TestMountPropagation(t *testing.T) {
 
 	// Check that mount didn't propagate to private mount.
 	privFile := filepath.Join(priv, "mnt", "file")
-	execArgs := &control.ExecArgs{
-		Filename: "/usr/bin/test",
-		Argv:     []string{"test", "!", "-f", privFile},
-	}
-	if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+	if ws, err := execute(cont, "/usr/bin/test", "!", "-f", privFile); err != nil || ws != 0 {
 		t.Fatalf("exec: test ! -f %q, ws: %v, err: %v", privFile, ws, err)
 	}
 
 	// Check that mount propagated to slave mount.
 	slaveFile := filepath.Join(slave, "mnt", "file")
-	execArgs = &control.ExecArgs{
-		Filename: "/usr/bin/test",
-		Argv:     []string{"test", "-f", slaveFile},
-	}
-	if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+	if ws, err := execute(cont, "/usr/bin/test", "-f", slaveFile); err != nil || ws != 0 {
 		t.Fatalf("exec: test -f %q, ws: %v, err: %v", privFile, ws, err)
 	}
 }
 
 func TestMountSymlink(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink")
 			if err != nil {
@@ -2196,11 +2206,7 @@ func TestMountSymlink(t *testing.T) {
 			// Check that symlink was resolved and mount was created where the symlink
 			// is pointing to.
 			file := path.Join(target, "file")
-			execArgs := &control.ExecArgs{
-				Filename: "/usr/bin/test",
-				Argv:     []string{"test", "-f", file},
-			}
-			if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+			if ws, err := execute(cont, "/usr/bin/test", "-f", file); err != nil || ws != 0 {
 				t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err)
 			}
 		})
@@ -2326,13 +2332,42 @@ func TestTTYField(t *testing.T) {
 	}
 }
 
+func execute(cont *Container, name string, arg ...string) (syscall.WaitStatus, error) {
+	args := &control.ExecArgs{
+		Filename: name,
+		Argv:     append([]string{name}, arg...),
+	}
+	return cont.executeSync(args)
+}
+
+func executeCombinedOutput(cont *Container, name string, arg ...string) ([]byte, syscall.WaitStatus, error) {
+	r, w, err := os.Pipe()
+	if err != nil {
+		return nil, 0, err
+	}
+	defer r.Close()
+
+	args := &control.ExecArgs{
+		Filename:    name,
+		Argv:        append([]string{name}, arg...),
+		FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, w, w}},
+	}
+	ws, err := cont.executeSync(args)
+	w.Close()
+	if err != nil {
+		return nil, 0, err
+	}
+	out, err := ioutil.ReadAll(r)
+	return out, ws, err
+}
+
 // executeSync synchronously executes a new process.
-func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
-	pid, err := cont.Execute(args)
+func (c *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
+	pid, err := c.Execute(args)
 	if err != nil {
 		return 0, fmt.Errorf("error executing: %v", err)
 	}
-	ws, err := cont.WaitPID(pid)
+	ws, err := c.WaitPID(pid)
 	if err != nil {
 		return 0, fmt.Errorf("error waiting: %v", err)
 	}
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index e189648f4..850e80290 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -33,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -60,7 +61,7 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 	return specs, ids
 }
 
-func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
+func startContainers(conf *config.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
 	if len(conf.RootDir) == 0 {
 		panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
 	}
@@ -168,7 +169,7 @@ func TestMultiContainerSanity(t *testing.T) {
 // TestMultiPIDNS checks that it is possible to run 2 dead-simple
 // containers in the same sandbox with different pidns.
 func TestMultiPIDNS(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -213,7 +214,7 @@ func TestMultiPIDNS(t *testing.T) {
 
 // TestMultiPIDNSPath checks the pidns path.
 func TestMultiPIDNSPath(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -479,7 +480,7 @@ func TestMultiContainerMount(t *testing.T) {
 // TestMultiContainerSignal checks that it is possible to signal individual
 // containers without killing the entire sandbox.
 func TestMultiContainerSignal(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -579,7 +580,7 @@ func TestMultiContainerDestroy(t *testing.T) {
 		t.Fatal("error finding test_app:", err)
 	}
 
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -1251,8 +1252,7 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) {
 
 // Test that shared pod mounts continue to work after container is restarted.
 func TestMultiContainerSharedMountRestart(t *testing.T) {
-	//TODO(gvisor.dev/issue/1487): This is failing with VFS2.
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -1359,7 +1359,7 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 }
 
 // Test that unsupported pod mounts options are ignored when matching master and
-// slave mounts.
+// replica mounts.
 func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
 	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
@@ -1517,8 +1517,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	}
 
 	// Check that container isn't running anymore.
-	args := &control.ExecArgs{Argv: []string{"/bin/true"}}
-	if _, err := c.executeSync(args); err == nil {
+	if _, err := execute(c, "/bin/true"); err == nil {
 		t.Fatalf("Container %q was not stopped after gofer death", c.ID)
 	}
 
@@ -1533,8 +1532,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 		if err := waitForProcessList(c, pl); err != nil {
 			t.Errorf("Container %q was affected by another container: %v", c.ID, err)
 		}
-		args := &control.ExecArgs{Argv: []string{"/bin/true"}}
-		if _, err := c.executeSync(args); err != nil {
+		if _, err := execute(c, "/bin/true"); err != nil {
 			t.Fatalf("Container %q was affected by another container: %v", c.ID, err)
 		}
 	}
@@ -1556,8 +1554,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 
 	// Check that entire sandbox isn't running anymore.
 	for _, c := range containers {
-		args := &control.ExecArgs{Argv: []string{"/bin/true"}}
-		if _, err := c.executeSync(args); err == nil {
+		if _, err := execute(c, "/bin/true"); err == nil {
 			t.Fatalf("Container %q was not stopped after gofer death", c.ID)
 		}
 	}
@@ -1694,12 +1691,11 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
 }
 
 // TestMultiContainerHomeEnvDir tests that the HOME environment variable is set
-// for root containers, sub-containers, and execed processes.
+// for root containers, sub-containers, and exec'ed processes.
 func TestMultiContainerHomeEnvDir(t *testing.T) {
-	// TODO(gvisor.dev/issue/1487): VFSv2 configs failing.
 	// NOTE: Don't use overlay since we need changes to persist to the temp dir
 	// outside the sandbox.
-	for testName, conf := range configs(t, noOverlay...) {
+	for testName, conf := range configsWithVFS2(t, noOverlay...) {
 		t.Run(testName, func(t *testing.T) {
 
 			rootDir, cleanup, err := testutil.SetupRootDir()
@@ -1719,12 +1715,11 @@ func TestMultiContainerHomeEnvDir(t *testing.T) {
 				homeDirs[name] = homeFile
 			}
 
-			// We will sleep in the root container in order to ensure that
-			// the root container doesn't terminate before sub containers can be
-			// created.
-			rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s; sleep 1000", homeDirs["root"].Name())}
-			subCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["sub"].Name())}
-			execCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name())}
+			// We will sleep in the root container in order to ensure that the root
+			//container doesn't terminate before sub containers can be created.
+			rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s; sleep 1000`, homeDirs["root"].Name())}
+			subCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["sub"].Name())}
+			execCmd := fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["exec"].Name())
 
 			// Setup the containers, a root container and sub container.
 			specConfig, ids := createSpecs(rootCmd, subCmd)
@@ -1735,9 +1730,8 @@ func TestMultiContainerHomeEnvDir(t *testing.T) {
 			defer cleanup()
 
 			// Exec into the root container synchronously.
-			args := &control.ExecArgs{Argv: execCmd}
-			if _, err := containers[0].executeSync(args); err != nil {
-				t.Errorf("error executing %+v: %v", args, err)
+			if _, err := execute(containers[0], "/bin/sh", "-c", execCmd); err != nil {
+				t.Errorf("error executing %+v: %v", execCmd, err)
 			}
 
 			// Wait for the subcontainer to finish.
diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go
index bac177a88..cb5bffb89 100644
--- a/runsc/container/shared_volume_test.go
+++ b/runsc/container/shared_volume_test.go
@@ -25,14 +25,14 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/test/testutil"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 // TestSharedVolume checks that modifications to a volume mount are propagated
 // into and out of the sandbox.
 func TestSharedVolume(t *testing.T) {
 	conf := testutil.TestConfig(t)
-	conf.FileAccess = boot.FileAccessShared
+	conf.FileAccess = config.FileAccessShared
 
 	// Main process just sleeps. We will use "exec" to probe the state of
 	// the filesystem.
@@ -168,11 +168,7 @@ func TestSharedVolume(t *testing.T) {
 
 func checkFile(c *Container, filename string, want []byte) error {
 	cpy := filename + ".copy"
-	argsCp := &control.ExecArgs{
-		Filename: "/bin/cp",
-		Argv:     []string{"cp", "-f", filename, cpy},
-	}
-	if _, err := c.executeSync(argsCp); err != nil {
+	if _, err := execute(c, "/bin/cp", "-f", filename, cpy); err != nil {
 		return fmt.Errorf("unexpected error copying file %q to %q: %v", filename, cpy, err)
 	}
 	got, err := ioutil.ReadFile(cpy)
@@ -189,7 +185,7 @@ func checkFile(c *Container, filename string, want []byte) error {
 // is reflected inside.
 func TestSharedVolumeFile(t *testing.T) {
 	conf := testutil.TestConfig(t)
-	conf.FileAccess = boot.FileAccessShared
+	conf.FileAccess = config.FileAccessShared
 
 	// Main process just sleeps. We will use "exec" to probe the state of
 	// the filesystem.
@@ -235,11 +231,7 @@ func TestSharedVolumeFile(t *testing.T) {
 	}
 
 	// Append to file inside the container and check that content is not lost.
-	argsAppend := &control.ExecArgs{
-		Filename: "/bin/bash",
-		Argv:     []string{"bash", "-c", "echo -n sandbox- >> " + filename},
-	}
-	if _, err := c.executeSync(argsAppend); err != nil {
+	if _, err := execute(c, "/bin/bash", "-c", "echo -n sandbox- >> "+filename); err != nil {
 		t.Fatalf("unexpected error appending file %q: %v", filename, err)
 	}
 	want = []byte("host-sandbox-")
diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go
index 0ca4829d7..775325c06 100644
--- a/runsc/flag/flag.go
+++ b/runsc/flag/flag.go
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// Package flag wraps flag primitives.
 package flag
 
 import (
@@ -21,13 +22,19 @@ import (
 type FlagSet = flag.FlagSet
 
 var (
-	NewFlagSet  = flag.NewFlagSet
-	String      = flag.String
 	Bool        = flag.Bool
-	Int         = flag.Int
-	Uint        = flag.Uint
 	CommandLine = flag.CommandLine
+	Int         = flag.Int
+	NewFlagSet  = flag.NewFlagSet
 	Parse       = flag.Parse
+	String      = flag.String
+	Uint        = flag.Uint
+	Var         = flag.Var
 )
 
 const ContinueOnError = flag.ContinueOnError
+
+// Get returns the flag's underlying object.
+func Get(v flag.Value) interface{} {
+	return v.(flag.Getter).Get()
+}
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index 05e3637f7..96c57a426 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -32,5 +32,6 @@ go_test(
         "//pkg/log",
         "//pkg/p9",
         "//pkg/test/testutil",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
index 88814b83c..39b8a0b1e 100644
--- a/runsc/fsgofer/filter/config.go
+++ b/runsc/fsgofer/filter/config.go
@@ -27,62 +27,51 @@ import (
 var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_ACCEPT:        {},
 	syscall.SYS_CLOCK_GETTIME: {},
-	syscall.SYS_CLONE: []seccomp.Rule{
-		{
-			seccomp.AllowValue(
-				syscall.CLONE_VM |
-					syscall.CLONE_FS |
-					syscall.CLONE_FILES |
-					syscall.CLONE_SIGHAND |
-					syscall.CLONE_SYSVSEM |
-					syscall.CLONE_THREAD),
-		},
-	},
-	syscall.SYS_CLOSE:     {},
-	syscall.SYS_DUP:       {},
-	syscall.SYS_EPOLL_CTL: {},
+	syscall.SYS_CLOSE:         {},
+	syscall.SYS_DUP:           {},
+	syscall.SYS_EPOLL_CTL:     {},
 	syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_EVENTFD2: []seccomp.Rule{
 		{
-			seccomp.AllowValue(0),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(0),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_EXIT:       {},
 	syscall.SYS_EXIT_GROUP: {},
 	syscall.SYS_FALLOCATE: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_FCHMOD:   {},
 	syscall.SYS_FCHOWNAT: {},
 	syscall.SYS_FCNTL: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_GETFL),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_GETFL),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_SETFL),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_SETFL),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_GETFD),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_GETFD),
 		},
 		// Used by flipcall.PacketWindowAllocator.Init().
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(unix.F_ADD_SEALS),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(unix.F_ADD_SEALS),
 		},
 	},
 	syscall.SYS_FSTAT:     {},
@@ -91,31 +80,31 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_FTRUNCATE: {},
 	syscall.SYS_FUTEX: {
 		seccomp.Rule{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 		seccomp.Rule{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 		// Non-private futex used for flipcall.
 		seccomp.Rule{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAIT),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAIT),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 		},
 		seccomp.Rule{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAKE),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAKE),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 		},
 	},
 	syscall.SYS_GETDENTS64:   {},
@@ -137,28 +126,28 @@ var allowedSyscalls = seccomp.SyscallRules{
 	// TODO(b/148688965): Remove once this is gone from Go.
 	syscall.SYS_MLOCK: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(4096),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(4096),
 		},
 	},
 	syscall.SYS_MMAP: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_SHARED),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_SHARED),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
 		},
 	},
 	syscall.SYS_MPROTECT:   {},
@@ -172,14 +161,14 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_READLINKAT: {},
 	syscall.SYS_RECVMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
 		},
 	},
 	syscall.SYS_RENAMEAT:        {},
@@ -190,33 +179,33 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_SENDMSG: []seccomp.Rule{
 		// Used by fdchannel.Endpoint.SendFD().
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 		// Used by unet.SocketWriter.WriteVec().
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
 		},
 	},
 	syscall.SYS_SHUTDOWN: []seccomp.Rule{
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)},
 	},
 	syscall.SYS_SIGALTSTACK: {},
 	// Used by fdchannel.NewConnectedSockets().
 	syscall.SYS_SOCKETPAIR: {
 		{
-			seccomp.AllowValue(syscall.AF_UNIX),
-			seccomp.AllowValue(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(syscall.AF_UNIX),
+			seccomp.EqualTo(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_SYMLINKAT: {},
 	syscall.SYS_TGKILL: []seccomp.Rule{
 		{
-			seccomp.AllowValue(uint64(os.Getpid())),
+			seccomp.EqualTo(uint64(os.Getpid())),
 		},
 	},
 	syscall.SYS_UNLINKAT:  {},
@@ -227,24 +216,24 @@ var allowedSyscalls = seccomp.SyscallRules{
 var udsSyscalls = seccomp.SyscallRules{
 	syscall.SYS_SOCKET: []seccomp.Rule{
 		{
-			seccomp.AllowValue(syscall.AF_UNIX),
-			seccomp.AllowValue(syscall.SOCK_STREAM),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(syscall.AF_UNIX),
+			seccomp.EqualTo(syscall.SOCK_STREAM),
+			seccomp.EqualTo(0),
 		},
 		{
-			seccomp.AllowValue(syscall.AF_UNIX),
-			seccomp.AllowValue(syscall.SOCK_DGRAM),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(syscall.AF_UNIX),
+			seccomp.EqualTo(syscall.SOCK_DGRAM),
+			seccomp.EqualTo(0),
 		},
 		{
-			seccomp.AllowValue(syscall.AF_UNIX),
-			seccomp.AllowValue(syscall.SOCK_SEQPACKET),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(syscall.AF_UNIX),
+			seccomp.EqualTo(syscall.SOCK_SEQPACKET),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_CONNECT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
 		},
 	},
 }
diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go
index a4b28cb8b..686753d96 100644
--- a/runsc/fsgofer/filter/config_amd64.go
+++ b/runsc/fsgofer/filter/config_amd64.go
@@ -25,8 +25,41 @@ import (
 
 func init() {
 	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
-		{seccomp.AllowValue(linux.ARCH_GET_FS)},
-		{seccomp.AllowValue(linux.ARCH_SET_FS)},
+		// TODO(b/168828518): No longer used in Go 1.16+.
+		{seccomp.EqualTo(linux.ARCH_SET_FS)},
+	}
+
+	allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+		// parent_tidptr and child_tidptr are always 0 because neither
+		// CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used.
+		{
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SETTLS |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			seccomp.EqualTo(0), // parent_tidptr
+			seccomp.EqualTo(0), // child_tidptr
+			seccomp.MatchAny{}, // tls
+		},
+		{
+			// TODO(b/168828518): No longer used in Go 1.16+ (on amd64).
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			seccomp.EqualTo(0), // parent_tidptr
+			seccomp.EqualTo(0), // child_tidptr
+			seccomp.MatchAny{}, // tls
+		},
 	}
 
 	allowedSyscalls[syscall.SYS_NEWFSTATAT] = []seccomp.Rule{}
diff --git a/runsc/fsgofer/filter/config_arm64.go b/runsc/fsgofer/filter/config_arm64.go
index d2697deb7..ff0cf77a0 100644
--- a/runsc/fsgofer/filter/config_arm64.go
+++ b/runsc/fsgofer/filter/config_arm64.go
@@ -23,5 +23,26 @@ import (
 )
 
 func init() {
+	allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+		// parent_tidptr and child_tidptr are always 0 because neither
+		// CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used.
+		{
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			// These arguments are left uninitialized by the Go
+			// runtime, so they may be anything (and are unused by
+			// the host).
+			seccomp.MatchAny{}, // parent_tidptr
+			seccomp.MatchAny{}, // tls
+			seccomp.MatchAny{}, // child_tidptr
+		},
+	}
+
 	allowedSyscalls[syscall.SYS_FSTATAT] = []seccomp.Rule{}
 }
diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go
index 885c92f7a..20a0732be 100644
--- a/runsc/fsgofer/filter/extra_filters_race.go
+++ b/runsc/fsgofer/filter/extra_filters_race.go
@@ -35,6 +35,7 @@ func instrumentationFilters() seccomp.SyscallRules {
 		syscall.SYS_MUNLOCK:         {},
 		syscall.SYS_NANOSLEEP:       {},
 		syscall.SYS_OPEN:            {},
+		syscall.SYS_OPENAT:          {},
 		syscall.SYS_SET_ROBUST_LIST: {},
 		// Used within glibc's malloc.
 		syscall.SYS_TIME: {},
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 639de9ca1..0b628c8ce 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -29,7 +29,6 @@ import (
 	"path/filepath"
 	"runtime"
 	"strconv"
-	"syscall"
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -45,7 +44,7 @@ const (
 	// modes to ensure an unopened/closed file fails all mode checks.
 	invalidMode = p9.OpenFlags(math.MaxUint32)
 
-	openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC
+	openFlags = unix.O_NOFOLLOW | unix.O_CLOEXEC
 
 	allowedOpenFlags = unix.O_TRUNC
 )
@@ -125,7 +124,7 @@ func (a *attachPoint) Attach() (p9.File, error) {
 }
 
 // makeQID returns a unique QID for the given stat buffer.
-func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
+func (a *attachPoint) makeQID(stat unix.Stat_t) p9.QID {
 	a.deviceMu.Lock()
 	defer a.deviceMu.Unlock()
 
@@ -156,9 +155,7 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
 // localFile implements p9.File wrapping a local file. The underlying file
 // is opened during Walk() and stored in 'file' to be used with other
 // operations. The file is opened as readonly, unless it's a symlink or there is
-// no read access, which requires O_PATH. 'file' is dup'ed when Walk(nil) is
-// called to clone the file. This reduces the number of walks that need to be
-// done by the host file system when files are reused.
+// no read access, which requires O_PATH.
 //
 // The file may be reopened if the requested mode in Open() is not a subset of
 // current mode. Consequently, 'file' could have a mode wider than requested and
@@ -170,11 +167,30 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
 // performance with 'overlay2' storage driver. overlay2 eagerly copies the
 // entire file up when it's opened in write mode, and would perform badly when
 // multiple files are only being opened for read (esp. startup).
+//
+// File operations must use "at" functions whenever possible:
+//   * Local operations must use AT_EMPTY_PATH:
+//  	   fchownat(fd, "", AT_EMPTY_PATH, ...), instead of chown(fullpath, ...)
+//   * Creation operations must use (fd + name):
+//       mkdirat(fd, name, ...), instead of mkdir(fullpath, ...)
+//
+// Apart from being faster, it also adds another layer of defense against
+// symlink attacks (note that O_NOFOLLOW applies only to the last element in
+// the path).
+//
+// The few exceptions where it cannot be done are: utimensat on symlinks, and
+// Connect() for the socket address.
 type localFile struct {
+	p9.DisallowClientCalls
+
 	// attachPoint is the attachPoint that serves this localFile.
 	attachPoint *attachPoint
 
-	// hostPath will be safely updated by the Renamed hook.
+	// hostPath is the full path to the host file. It can be used for logging and
+	// the few cases where full path is required to operation the host file. In
+	// all other cases, use "file" directly.
+	//
+	// Note: it's safely updated by the Renamed hook.
 	hostPath string
 
 	// file is opened when localFile is created and it's never nil. It may be
@@ -191,7 +207,7 @@ type localFile struct {
 	mode p9.OpenFlags
 
 	// fileType for this file. It is equivalent to:
-	// syscall.Stat_t.Mode & syscall.S_IFMT
+	// unix.Stat_t.Mode & unix.S_IFMT
 	fileType uint32
 
 	qid p9.QID
@@ -211,7 +227,7 @@ var procSelfFD *fd.FD
 // OpenProcSelfFD opens the /proc/self/fd directory, which will be used to
 // reopen file descriptors.
 func OpenProcSelfFD() error {
-	d, err := syscall.Open("/proc/self/fd", syscall.O_RDONLY|syscall.O_DIRECTORY, 0)
+	d, err := unix.Open("/proc/self/fd", unix.O_RDONLY|unix.O_DIRECTORY, 0)
 	if err != nil {
 		return fmt.Errorf("error opening /proc/self/fd: %v", err)
 	}
@@ -220,7 +236,7 @@ func OpenProcSelfFD() error {
 }
 
 func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) {
-	d, err := syscall.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^syscall.O_NOFOLLOW, 0)
+	d, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^unix.O_NOFOLLOW, 0)
 	if err != nil {
 		return nil, err
 	}
@@ -229,17 +245,17 @@ func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) {
 }
 
 func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, bool, error) {
-	path := path.Join(parent.hostPath, name)
-	f, readable, err := openAnyFile(path, func(mode int) (*fd.FD, error) {
+	pathDebug := path.Join(parent.hostPath, name)
+	f, readable, err := openAnyFile(pathDebug, func(mode int) (*fd.FD, error) {
 		return fd.OpenAt(parent.file, name, openFlags|mode, 0)
 	})
-	return f, path, readable, err
+	return f, pathDebug, readable, err
 }
 
-// openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback
+// openAnyFile attempts to open the file in O_RDONLY. If it fails, falls back
 // to O_PATH. 'path' is used for logging messages only. 'fn' is what does the
 // actual file open and is customizable by the caller.
-func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool, error) {
+func openAnyFile(pathDebug string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool, error) {
 	// Attempt to open file in the following mode in order:
 	//   1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs.
 	//      Use non-blocking to prevent getting stuck inside open(2) for
@@ -250,7 +266,7 @@ func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool,
 		readable bool
 	}{
 		{
-			mode:     syscall.O_RDONLY | syscall.O_NONBLOCK,
+			mode:     unix.O_RDONLY | unix.O_NONBLOCK,
 			readable: true,
 		},
 		{
@@ -268,36 +284,36 @@ func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool,
 			return file, option.readable, nil
 		}
 		switch e := extractErrno(err); e {
-		case syscall.ENOENT:
+		case unix.ENOENT:
 			// File doesn't exist, no point in retrying.
 			return nil, false, e
 		}
 		// File failed to open. Try again with next mode, preserving 'err' in case
 		// this was the last attempt.
-		log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|option.mode, path, err)
+		log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|option.mode, pathDebug, err)
 	}
 	// All attempts to open file have failed, return the last error.
-	log.Debugf("Failed to open file, path: %q, err: %v", path, err)
+	log.Debugf("Failed to open file, path: %q, err: %v", pathDebug, err)
 	return nil, false, extractErrno(err)
 }
 
-func checkSupportedFileType(stat syscall.Stat_t, permitSocket bool) error {
-	switch stat.Mode & syscall.S_IFMT {
-	case syscall.S_IFREG, syscall.S_IFDIR, syscall.S_IFLNK:
+func checkSupportedFileType(stat unix.Stat_t, permitSocket bool) error {
+	switch stat.Mode & unix.S_IFMT {
+	case unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK:
 		return nil
 
-	case syscall.S_IFSOCK:
+	case unix.S_IFSOCK:
 		if !permitSocket {
-			return syscall.EPERM
+			return unix.EPERM
 		}
 		return nil
 
 	default:
-		return syscall.EPERM
+		return unix.EPERM
 	}
 }
 
-func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat syscall.Stat_t) (*localFile, error) {
+func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat unix.Stat_t) (*localFile, error) {
 	if err := checkSupportedFileType(stat, a.conf.HostUDS); err != nil {
 		return nil, err
 	}
@@ -307,7 +323,7 @@ func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat
 		hostPath:        path,
 		file:            file,
 		mode:            invalidMode,
-		fileType:        stat.Mode & syscall.S_IFMT,
+		fileType:        stat.Mode & unix.S_IFMT,
 		qid:             a.makeQID(stat),
 		controlReadable: readable,
 	}, nil
@@ -317,7 +333,7 @@ func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat
 // non-blocking. If anything fails, returns nil. It's better to have a file
 // without host FD, than to fail the operation.
 func newFDMaybe(file *fd.FD) *fd.FD {
-	dupFD, err := syscall.Dup(file.FD())
+	dupFD, err := unix.Dup(file.FD())
 	// Technically, the runtime may call the finalizer on file as soon as
 	// FD() returns.
 	runtime.KeepAlive(file)
@@ -327,31 +343,23 @@ func newFDMaybe(file *fd.FD) *fd.FD {
 	dup := fd.New(dupFD)
 
 	// fd is blocking; non-blocking is required.
-	if err := syscall.SetNonblock(dup.FD(), true); err != nil {
+	if err := unix.SetNonblock(dup.FD(), true); err != nil {
 		_ = dup.Close()
 		return nil
 	}
 	return dup
 }
 
-func fstat(fd int) (syscall.Stat_t, error) {
-	var stat syscall.Stat_t
-	if err := syscall.Fstat(fd, &stat); err != nil {
-		return syscall.Stat_t{}, err
-	}
-	return stat, nil
-}
-
-func stat(path string) (syscall.Stat_t, error) {
-	var stat syscall.Stat_t
-	if err := syscall.Stat(path, &stat); err != nil {
-		return syscall.Stat_t{}, err
+func fstat(fd int) (unix.Stat_t, error) {
+	var stat unix.Stat_t
+	if err := unix.Fstat(fd, &stat); err != nil {
+		return unix.Stat_t{}, err
 	}
 	return stat, nil
 }
 
 func fchown(fd int, uid p9.UID, gid p9.GID) error {
-	return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
+	return unix.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
 }
 
 // Open implements p9.File.
@@ -377,7 +385,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 		// name_to_handle_at and open_by_handle_at aren't supported by overlay2.
 		log.Debugf("Open reopening file, flags: %v, %q", flags, l.hostPath)
 		var err error
-		osFlags := flags.OSFlags() & (syscall.O_ACCMODE | allowedOpenFlags)
+		osFlags := flags.OSFlags() & (unix.O_ACCMODE | allowedOpenFlags)
 		newFile, err = reopenProcFd(l.file, openFlags|osFlags)
 		if err != nil {
 			return nil, p9.QID{}, 0, extractErrno(err)
@@ -385,7 +393,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 	}
 
 	var fd *fd.FD
-	if l.fileType == syscall.S_IFREG {
+	if l.fileType == unix.S_IFREG {
 		// Donate FD for regular files only.
 		fd = newFDMaybe(newFile)
 	}
@@ -408,7 +416,7 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode,
 	}
 
 	// Set file creation flags, plus allowed open flags from caller.
-	osFlags := openFlags | syscall.O_CREAT | syscall.O_EXCL
+	osFlags := openFlags | unix.O_CREAT | unix.O_EXCL
 	osFlags |= p9Flags.OSFlags() & allowedOpenFlags
 
 	// 'file' may be used for other operations (e.g. Walk), so read access is
@@ -416,9 +424,9 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode,
 	// than needed for each particular case.
 	mode := p9Flags & p9.OpenFlagsModeMask
 	if mode == p9.WriteOnly {
-		osFlags |= syscall.O_RDWR
+		osFlags |= unix.O_RDWR
 	} else {
-		osFlags |= mode.OSFlags() & unix.O_ACCMODE
+		osFlags |= mode.OSFlags()
 	}
 
 	child, err := fd.OpenAt(l.file, name, osFlags, uint32(perm.Permissions()))
@@ -428,7 +436,7 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode,
 	cu := cleanup.Make(func() {
 		_ = child.Close()
 		// Best effort attempt to remove the file in case of failure.
-		if err := syscall.Unlinkat(l.file.FD(), name); err != nil {
+		if err := unix.Unlinkat(l.file.FD(), name, 0); err != nil {
 			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err)
 		}
 	})
@@ -447,7 +455,7 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode,
 		hostPath:    path.Join(l.hostPath, name),
 		file:        child,
 		mode:        mode,
-		fileType:    syscall.S_IFREG,
+		fileType:    unix.S_IFREG,
 		qid:         l.attachPoint.makeQID(stat),
 	}
 
@@ -461,7 +469,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
 		return p9.QID{}, err
 	}
 
-	if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
+	if err := unix.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
 	cu := cleanup.Make(func() {
@@ -473,7 +481,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
 	defer cu.Clean()
 
 	// Open directory to change ownership and stat it.
-	flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags
+	flags := unix.O_DIRECTORY | unix.O_RDONLY | openFlags
 	f, err := fd.OpenAt(l.file, name, flags, 0)
 	if err != nil {
 		return p9.QID{}, extractErrno(err)
@@ -508,20 +516,20 @@ func (l *localFile) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask,
 	return qids, file, mask, attr, nil
 }
 
-func (l *localFile) walk(names []string) ([]p9.QID, p9.File, syscall.Stat_t, error) {
+func (l *localFile) walk(names []string) ([]p9.QID, p9.File, unix.Stat_t, error) {
 	// Duplicate current file if 'names' is empty.
 	if len(names) == 0 {
 		newFile, readable, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) {
 			return reopenProcFd(l.file, openFlags|mode)
 		})
 		if err != nil {
-			return nil, nil, syscall.Stat_t{}, extractErrno(err)
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
 
 		stat, err := fstat(newFile.FD())
 		if err != nil {
 			_ = newFile.Close()
-			return nil, nil, syscall.Stat_t{}, extractErrno(err)
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
 
 		c := &localFile{
@@ -537,7 +545,7 @@ func (l *localFile) walk(names []string) ([]p9.QID, p9.File, syscall.Stat_t, err
 	}
 
 	var qids []p9.QID
-	var lastStat syscall.Stat_t
+	var lastStat unix.Stat_t
 	last := l
 	for _, name := range names {
 		f, path, readable, err := openAnyFileFromParent(last, name)
@@ -545,17 +553,17 @@ func (l *localFile) walk(names []string) ([]p9.QID, p9.File, syscall.Stat_t, err
 			_ = last.Close()
 		}
 		if err != nil {
-			return nil, nil, syscall.Stat_t{}, extractErrno(err)
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
 		lastStat, err = fstat(f.FD())
 		if err != nil {
 			_ = f.Close()
-			return nil, nil, syscall.Stat_t{}, extractErrno(err)
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
 		c, err := newLocalFile(last.attachPoint, f, path, readable, lastStat)
 		if err != nil {
 			_ = f.Close()
-			return nil, nil, syscall.Stat_t{}, extractErrno(err)
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
 
 		qids = append(qids, c.qid)
@@ -566,8 +574,8 @@ func (l *localFile) walk(names []string) ([]p9.QID, p9.File, syscall.Stat_t, err
 
 // StatFS implements p9.File.
 func (l *localFile) StatFS() (p9.FSStat, error) {
-	var s syscall.Statfs_t
-	if err := syscall.Fstatfs(l.file.FD(), &s); err != nil {
+	var s unix.Statfs_t
+	if err := unix.Fstatfs(l.file.FD(), &s); err != nil {
 		return p9.FSStat{}, extractErrno(err)
 	}
 
@@ -587,9 +595,9 @@ func (l *localFile) StatFS() (p9.FSStat, error) {
 // FSync implements p9.File.
 func (l *localFile) FSync() error {
 	if !l.isOpen() {
-		return syscall.EBADF
+		return unix.EBADF
 	}
-	if err := syscall.Fsync(l.file.FD()); err != nil {
+	if err := unix.Fsync(l.file.FD()); err != nil {
 		return extractErrno(err)
 	}
 	return nil
@@ -605,7 +613,7 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error)
 	return l.qid, mask, attr, nil
 }
 
-func (l *localFile) fillAttr(stat syscall.Stat_t) (p9.AttrMask, p9.Attr) {
+func (l *localFile) fillAttr(stat unix.Stat_t) (p9.AttrMask, p9.Attr) {
 	attr := p9.Attr{
 		Mode:             p9.FileMode(stat.Mode),
 		UID:              p9.UID(stat.Uid),
@@ -665,13 +673,13 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 	// consistent result that is not attribute dependent.
 	if !valid.IsSubsetOf(allowed) {
 		log.Warningf("SetAttr() failed for %q, mask: %v", l.hostPath, valid)
-		return syscall.EPERM
+		return unix.EPERM
 	}
 
 	// Check if it's possible to use cached file, or if another one needs to be
 	// opened for write.
 	f := l.file
-	if l.fileType == syscall.S_IFREG && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
+	if l.fileType == unix.S_IFREG && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
 		var err error
 		f, err = reopenProcFd(l.file, openFlags|os.O_WRONLY)
 		if err != nil {
@@ -692,21 +700,21 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 	// over another.
 	var err error
 	if valid.Permissions {
-		if cerr := syscall.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil {
+		if cerr := unix.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil {
 			log.Debugf("SetAttr fchmod failed %q, err: %v", l.hostPath, cerr)
 			err = extractErrno(cerr)
 		}
 	}
 
 	if valid.Size {
-		if terr := syscall.Ftruncate(f.FD(), int64(attr.Size)); terr != nil {
+		if terr := unix.Ftruncate(f.FD(), int64(attr.Size)); terr != nil {
 			log.Debugf("SetAttr ftruncate failed %q, err: %v", l.hostPath, terr)
 			err = extractErrno(terr)
 		}
 	}
 
 	if valid.ATime || valid.MTime {
-		utimes := [2]syscall.Timespec{
+		utimes := [2]unix.Timespec{
 			{Sec: 0, Nsec: linux.UTIME_OMIT},
 			{Sec: 0, Nsec: linux.UTIME_OMIT},
 		}
@@ -727,15 +735,15 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 			}
 		}
 
-		if l.fileType == syscall.S_IFLNK {
+		if l.fileType == unix.S_IFLNK {
 			// utimensat operates different that other syscalls. To operate on a
 			// symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
 			// name.
-			parent, err := syscall.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0)
+			parent, err := unix.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0)
 			if err != nil {
 				return extractErrno(err)
 			}
-			defer syscall.Close(parent)
+			defer unix.Close(parent)
 
 			if terr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil {
 				log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr)
@@ -760,7 +768,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 		if valid.GID {
 			gid = int(attr.GID)
 		}
-		if oerr := syscall.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
+		if oerr := unix.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
 			log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr)
 			err = extractErrno(oerr)
 		}
@@ -770,28 +778,28 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 }
 
 func (*localFile) GetXattr(string, uint64) (string, error) {
-	return "", syscall.EOPNOTSUPP
+	return "", unix.EOPNOTSUPP
 }
 
 func (*localFile) SetXattr(string, string, uint32) error {
-	return syscall.EOPNOTSUPP
+	return unix.EOPNOTSUPP
 }
 
 func (*localFile) ListXattr(uint64) (map[string]struct{}, error) {
-	return nil, syscall.EOPNOTSUPP
+	return nil, unix.EOPNOTSUPP
 }
 
 func (*localFile) RemoveXattr(string) error {
-	return syscall.EOPNOTSUPP
+	return unix.EOPNOTSUPP
 }
 
 // Allocate implements p9.File.
 func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error {
 	if !l.isOpen() {
-		return syscall.EBADF
+		return unix.EBADF
 	}
 
-	if err := syscall.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil {
+	if err := unix.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil {
 		return extractErrno(err)
 	}
 	return nil
@@ -818,10 +826,10 @@ func (l *localFile) RenameAt(oldName string, directory p9.File, newName string)
 // ReadAt implements p9.File.
 func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) {
 	if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite {
-		return 0, syscall.EBADF
+		return 0, unix.EBADF
 	}
 	if !l.isOpen() {
-		return 0, syscall.EBADF
+		return 0, unix.EBADF
 	}
 
 	r, err := l.file.ReadAt(p, int64(offset))
@@ -836,10 +844,10 @@ func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) {
 // WriteAt implements p9.File.
 func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) {
 	if l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
-		return 0, syscall.EBADF
+		return 0, unix.EBADF
 	}
 	if !l.isOpen() {
-		return 0, syscall.EBADF
+		return 0, unix.EBADF
 	}
 
 	w, err := l.file.WriteAt(p, int64(offset))
@@ -860,7 +868,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
 	}
 	cu := cleanup.Make(func() {
 		// Best effort attempt to remove the symlink in case of failure.
-		if err := syscall.Unlinkat(l.file.FD(), newName); err != nil {
+		if err := unix.Unlinkat(l.file.FD(), newName, 0); err != nil {
 			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err)
 		}
 	})
@@ -899,34 +907,46 @@ func (l *localFile) Link(target p9.File, newName string) error {
 }
 
 // Mknod implements p9.File.
-func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) {
+func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid p9.UID, gid p9.GID) (p9.QID, error) {
 	if err := l.checkROMount(); err != nil {
 		return p9.QID{}, err
 	}
 
-	hostPath := path.Join(l.hostPath, name)
-
-	// Return EEXIST if the file already exists.
-	if _, err := stat(hostPath); err == nil {
-		return p9.QID{}, syscall.EEXIST
-	}
-
 	// From mknod(2) man page:
 	// "EPERM: [...] if the filesystem containing pathname does not support
 	// the type of node requested."
 	if mode.FileType() != p9.ModeRegular {
-		return p9.QID{}, syscall.EPERM
+		return p9.QID{}, unix.EPERM
 	}
 
 	// Allow Mknod to create regular files.
-	if err := syscall.Mknod(hostPath, uint32(mode), 0); err != nil {
+	if err := unix.Mknodat(l.file.FD(), name, uint32(mode), 0); err != nil {
 		return p9.QID{}, err
 	}
+	cu := cleanup.Make(func() {
+		// Best effort attempt to remove the file in case of failure.
+		if err := unix.Unlinkat(l.file.FD(), name, 0); err != nil {
+			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err)
+		}
+	})
+	defer cu.Clean()
+
+	// Open file to change ownership and stat it.
+	child, err := fd.OpenAt(l.file, name, unix.O_PATH|openFlags, 0)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	defer child.Close()
 
-	stat, err := stat(hostPath)
+	if err := fchown(child.FD(), uid, gid); err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	stat, err := fstat(child.FD())
 	if err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
+
+	cu.Release()
 	return l.attachPoint.makeQID(stat), nil
 }
 
@@ -945,10 +965,10 @@ func (l *localFile) UnlinkAt(name string, flags uint32) error {
 // Readdir implements p9.File.
 func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
 	if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite {
-		return nil, syscall.EBADF
+		return nil, unix.EBADF
 	}
 	if !l.isOpen() {
-		return nil, syscall.EBADF
+		return nil, unix.EBADF
 	}
 
 	// Readdirnames is a cursor over directories, so seek back to 0 to ensure it's
@@ -965,7 +985,7 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
 	// which causes the directory stream to resynchronize with the directory's
 	// current contents).
 	if l.lastDirentOffset != offset || offset == 0 {
-		if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil {
+		if _, err := unix.Seek(l.file.FD(), 0, 0); err != nil {
 			return nil, extractErrno(err)
 		}
 		skip = offset
@@ -998,7 +1018,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64)
 
 	end := offset + uint64(count)
 	for offset < end {
-		dirSize, err := syscall.ReadDirent(f, direntsBuf)
+		dirSize, err := unix.ReadDirent(f, direntsBuf)
 		if err != nil {
 			return dirents, err
 		}
@@ -1007,7 +1027,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64)
 		}
 
 		names := names[:0]
-		_, _, names = syscall.ParseDirent(direntsBuf[:dirSize], -1, names)
+		_, _, names = unix.ParseDirent(direntsBuf[:dirSize], -1, names)
 
 		// Skip over entries that the caller is not interested in.
 		if skip > 0 {
@@ -1052,7 +1072,7 @@ func (l *localFile) Readlink() (string, error) {
 			return string(b[:n]), nil
 		}
 	}
-	return "", syscall.ENOMEM
+	return "", unix.ENOMEM
 }
 
 // Flush implements p9.File.
@@ -1063,7 +1083,7 @@ func (l *localFile) Flush() error {
 // Connect implements p9.File.
 func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) {
 	if !l.attachPoint.conf.HostUDS {
-		return nil, syscall.ECONNREFUSED
+		return nil, unix.ECONNREFUSED
 	}
 
 	// TODO(gvisor.dev/issue/1003): Due to different app vs replacement
@@ -1071,34 +1091,34 @@ func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) {
 	// fit f.path in our sockaddr. We'd need to redirect through a shorter
 	// path in order to actually connect to this socket.
 	if len(l.hostPath) > linux.UnixPathMax {
-		return nil, syscall.ECONNREFUSED
+		return nil, unix.ECONNREFUSED
 	}
 
 	var stype int
 	switch flags {
 	case p9.StreamSocket:
-		stype = syscall.SOCK_STREAM
+		stype = unix.SOCK_STREAM
 	case p9.DgramSocket:
-		stype = syscall.SOCK_DGRAM
+		stype = unix.SOCK_DGRAM
 	case p9.SeqpacketSocket:
-		stype = syscall.SOCK_SEQPACKET
+		stype = unix.SOCK_SEQPACKET
 	default:
-		return nil, syscall.ENXIO
+		return nil, unix.ENXIO
 	}
 
-	f, err := syscall.Socket(syscall.AF_UNIX, stype, 0)
+	f, err := unix.Socket(unix.AF_UNIX, stype, 0)
 	if err != nil {
 		return nil, err
 	}
 
-	if err := syscall.SetNonblock(f, true); err != nil {
-		_ = syscall.Close(f)
+	if err := unix.SetNonblock(f, true); err != nil {
+		_ = unix.Close(f)
 		return nil, err
 	}
 
-	sa := syscall.SockaddrUnix{Name: l.hostPath}
-	if err := syscall.Connect(f, &sa); err != nil {
-		_ = syscall.Close(f)
+	sa := unix.SockaddrUnix{Name: l.hostPath}
+	if err := unix.Connect(f, &sa); err != nil {
+		_ = unix.Close(f)
 		return nil, err
 	}
 
@@ -1123,7 +1143,7 @@ func (l *localFile) Renamed(newDir p9.File, newName string) {
 }
 
 // extractErrno tries to determine the errno.
-func extractErrno(err error) syscall.Errno {
+func extractErrno(err error) unix.Errno {
 	if err == nil {
 		// This should never happen. The likely result will be that
 		// some user gets the frustrating "error: SUCCESS" message.
@@ -1133,18 +1153,18 @@ func extractErrno(err error) syscall.Errno {
 
 	switch err {
 	case os.ErrNotExist:
-		return syscall.ENOENT
+		return unix.ENOENT
 	case os.ErrExist:
-		return syscall.EEXIST
+		return unix.EEXIST
 	case os.ErrPermission:
-		return syscall.EACCES
+		return unix.EACCES
 	case os.ErrInvalid:
-		return syscall.EINVAL
+		return unix.EINVAL
 	}
 
 	// See if it's an errno or a common wrapped error.
 	switch e := err.(type) {
-	case syscall.Errno:
+	case unix.Errno:
 		return e
 	case *os.PathError:
 		return extractErrno(e.Err)
@@ -1156,15 +1176,12 @@ func extractErrno(err error) syscall.Errno {
 
 	// Fall back to EIO.
 	log.Debugf("Unknown error: %v, defaulting to EIO", err)
-	return syscall.EIO
+	return unix.EIO
 }
 
 func (l *localFile) checkROMount() error {
 	if conf := l.attachPoint.conf; conf.ROMount {
-		if conf.PanicOnWrite {
-			panic("attempt to write to RO mount")
-		}
-		return syscall.EROFS
+		return unix.EROFS
 	}
 	return nil
 }
diff --git a/runsc/fsgofer/fsgofer_amd64_unsafe.go b/runsc/fsgofer/fsgofer_amd64_unsafe.go
index 5d4aab597..c46958185 100644
--- a/runsc/fsgofer/fsgofer_amd64_unsafe.go
+++ b/runsc/fsgofer/fsgofer_amd64_unsafe.go
@@ -17,25 +17,25 @@
 package fsgofer
 
 import (
-	"syscall"
 	"unsafe"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/syserr"
 )
 
-func statAt(dirFd int, name string) (syscall.Stat_t, error) {
-	nameBytes, err := syscall.BytePtrFromString(name)
+func statAt(dirFd int, name string) (unix.Stat_t, error) {
+	nameBytes, err := unix.BytePtrFromString(name)
 	if err != nil {
-		return syscall.Stat_t{}, err
+		return unix.Stat_t{}, err
 	}
 	namePtr := unsafe.Pointer(nameBytes)
 
-	var stat syscall.Stat_t
+	var stat unix.Stat_t
 	statPtr := unsafe.Pointer(&stat)
 
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_NEWFSTATAT,
+	if _, _, errno := unix.Syscall6(
+		unix.SYS_NEWFSTATAT,
 		uintptr(dirFd),
 		uintptr(namePtr),
 		uintptr(statPtr),
@@ -43,7 +43,7 @@ func statAt(dirFd int, name string) (syscall.Stat_t, error) {
 		0,
 		0); errno != 0 {
 
-		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+		return unix.Stat_t{}, syserr.FromHost(errno).ToError()
 	}
 	return stat, nil
 }
diff --git a/runsc/fsgofer/fsgofer_arm64_unsafe.go b/runsc/fsgofer/fsgofer_arm64_unsafe.go
index 8041fd352..491460718 100644
--- a/runsc/fsgofer/fsgofer_arm64_unsafe.go
+++ b/runsc/fsgofer/fsgofer_arm64_unsafe.go
@@ -17,25 +17,25 @@
 package fsgofer
 
 import (
-	"syscall"
 	"unsafe"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/syserr"
 )
 
-func statAt(dirFd int, name string) (syscall.Stat_t, error) {
-	nameBytes, err := syscall.BytePtrFromString(name)
+func statAt(dirFd int, name string) (unix.Stat_t, error) {
+	nameBytes, err := unix.BytePtrFromString(name)
 	if err != nil {
-		return syscall.Stat_t{}, err
+		return unix.Stat_t{}, err
 	}
 	namePtr := unsafe.Pointer(nameBytes)
 
-	var stat syscall.Stat_t
+	var stat unix.Stat_t
 	statPtr := unsafe.Pointer(&stat)
 
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_FSTATAT,
+	if _, _, errno := unix.Syscall6(
+		unix.SYS_FSTATAT,
 		uintptr(dirFd),
 		uintptr(namePtr),
 		uintptr(statPtr),
@@ -43,7 +43,7 @@ func statAt(dirFd int, name string) (syscall.Stat_t, error) {
 		0,
 		0); errno != 0 {
 
-		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+		return unix.Stat_t{}, syserr.FromHost(errno).ToError()
 	}
 	return stat, nil
 }
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index 8ed703584..a84206686 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -21,9 +21,9 @@ import (
 	"os"
 	"path"
 	"path/filepath"
-	"syscall"
 	"testing"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/test/testutil"
@@ -32,7 +32,7 @@ import (
 var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite}
 
 var (
-	allTypes = []uint32{syscall.S_IFREG, syscall.S_IFDIR, syscall.S_IFLNK}
+	allTypes = []uint32{unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK}
 
 	// allConfs is set in init().
 	allConfs []Config
@@ -52,8 +52,8 @@ func init() {
 	}
 }
 
-func configTestName(config *Config) string {
-	if config.ROMount {
+func configTestName(conf *Config) string {
+	if conf.ROMount {
 		return "ROMount"
 	}
 	return "RWMount"
@@ -83,7 +83,7 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error {
 		}
 		want = append(want, b...)
 	} else {
-		if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF {
+		if e, ok := err.(unix.Errno); !ok || e != unix.EBADF {
 			return fmt.Errorf("WriteAt() should have failed, got: %d, want: EBADFD", err)
 		}
 	}
@@ -101,7 +101,7 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error {
 			return fmt.Errorf("ReadAt() wrong data, got: %s, want: %s", string(rBuf), want)
 		}
 	} else {
-		if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF {
+		if e, ok := err.(unix.Errno); !ok || e != unix.EBADF {
 			return fmt.Errorf("ReadAt() should have failed, got: %d, want: EBADFD", err)
 		}
 	}
@@ -121,11 +121,11 @@ func (s state) String() string {
 
 func typeName(fileType uint32) string {
 	switch fileType {
-	case syscall.S_IFREG:
+	case unix.S_IFREG:
 		return "file"
-	case syscall.S_IFDIR:
+	case unix.S_IFDIR:
 		return "directory"
-	case syscall.S_IFLNK:
+	case unix.S_IFLNK:
 		return "symlink"
 	default:
 		panic(fmt.Sprintf("invalid file type for test: %d", fileType))
@@ -195,19 +195,19 @@ func setup(fileType uint32) (string, string, error) {
 
 	var name string
 	switch fileType {
-	case syscall.S_IFREG:
+	case unix.S_IFREG:
 		name = "file"
 		_, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
 		if err != nil {
 			return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err)
 		}
 		defer f.Close()
-	case syscall.S_IFDIR:
+	case unix.S_IFDIR:
 		name = "dir"
 		if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
 			return "", "", fmt.Errorf("root.MkDir(%q) failed, err: %v", name, err)
 		}
-	case syscall.S_IFLNK:
+	case unix.S_IFLNK:
 		name = "symlink"
 		if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
 			return "", "", fmt.Errorf("root.Symlink(%q) failed, err: %v", name, err)
@@ -227,7 +227,7 @@ func createFile(dir *localFile, name string) (*localFile, error) {
 }
 
 func TestReadWrite(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		child, err := createFile(s.file, "test")
 		if err != nil {
 			t.Fatalf("%v: createFile() failed, err: %v", s, err)
@@ -246,9 +246,13 @@ func TestReadWrite(t *testing.T) {
 			if err != nil {
 				t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err)
 			}
-			if _, _, _, err := l.Open(flags); err != nil {
+			fd, _, _, err := l.Open(flags)
+			if err != nil {
 				t.Fatalf("%v: Open(%v) failed, err: %v", s, flags, err)
 			}
+			if fd != nil {
+				defer fd.Close()
+			}
 			if err := testReadWrite(l, flags, want); err != nil {
 				t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err)
 			}
@@ -257,14 +261,14 @@ func TestReadWrite(t *testing.T) {
 }
 
 func TestCreate(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		for i, flags := range allOpenFlags {
 			_, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
 			if err != nil {
 				t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, flags, err)
 			}
 
-			if err := testReadWrite(l, flags, []byte{}); err != nil {
+			if err := testReadWrite(l, flags, nil); err != nil {
 				t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err)
 			}
 		}
@@ -274,7 +278,7 @@ func TestCreate(t *testing.T) {
 // TestReadWriteDup tests that a file opened in any mode can be dup'ed and
 // reopened in any other mode.
 func TestReadWriteDup(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		child, err := createFile(s.file, "test")
 		if err != nil {
 			t.Fatalf("%v: createFile() failed, err: %v", s, err)
@@ -304,9 +308,13 @@ func TestReadWriteDup(t *testing.T) {
 					t.Fatalf("%v: Walk(<empty>) failed: %v", s, err)
 				}
 				defer dup.Close()
-				if _, _, _, err := dup.Open(dupFlags); err != nil {
+				fd, _, _, err := dup.Open(dupFlags)
+				if err != nil {
 					t.Fatalf("%v: Open(%v) failed: %v", s, flags, err)
 				}
+				if fd != nil {
+					defer fd.Close()
+				}
 				if err := testReadWrite(dup, dupFlags, want); err != nil {
 					t.Fatalf("%v: testReadWrite(%v) failed: %v", s, dupFlags, err)
 				}
@@ -316,19 +324,19 @@ func TestReadWriteDup(t *testing.T) {
 }
 
 func TestUnopened(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFREG}, allConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFREG}, allConfs, func(t *testing.T, s state) {
 		b := []byte("foobar")
-		if _, err := s.file.WriteAt(b, 0); err != syscall.EBADF {
-			t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if _, err := s.file.WriteAt(b, 0); err != unix.EBADF {
+			t.Errorf("%v: WriteAt() should have failed, got: %v, expected: unix.EBADF", s, err)
 		}
-		if _, err := s.file.ReadAt(b, 0); err != syscall.EBADF {
-			t.Errorf("%v: ReadAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if _, err := s.file.ReadAt(b, 0); err != unix.EBADF {
+			t.Errorf("%v: ReadAt() should have failed, got: %v, expected: unix.EBADF", s, err)
 		}
-		if _, err := s.file.Readdir(0, 100); err != syscall.EBADF {
-			t.Errorf("%v: Readdir() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if _, err := s.file.Readdir(0, 100); err != unix.EBADF {
+			t.Errorf("%v: Readdir() should have failed, got: %v, expected: unix.EBADF", s, err)
 		}
-		if err := s.file.FSync(); err != syscall.EBADF {
-			t.Errorf("%v: FSync() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if err := s.file.FSync(); err != unix.EBADF {
+			t.Errorf("%v: FSync() should have failed, got: %v, expected: unix.EBADF", s, err)
 		}
 	})
 }
@@ -338,7 +346,7 @@ func TestUnopened(t *testing.T) {
 // was open with O_PATH, but Open() was not checking for it and allowing the
 // control file to be reused.
 func TestOpenOPath(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFREG}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFREG}, rwConfs, func(t *testing.T, s state) {
 		// Fist remove all permissions on the file.
 		if err := s.file.SetAttr(p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(0)}); err != nil {
 			t.Fatalf("SetAttr(): %v", err)
@@ -353,7 +361,7 @@ func TestOpenOPath(t *testing.T) {
 		if newFile.(*localFile).controlReadable {
 			t.Fatalf("control file didn't open with O_PATH: %+v", newFile)
 		}
-		if _, _, _, err := newFile.Open(p9.ReadOnly); err != syscall.EACCES {
+		if _, _, _, err := newFile.Open(p9.ReadOnly); err != unix.EACCES {
 			t.Fatalf("Open() should have failed, got: %v, wanted: EACCES", err)
 		}
 	})
@@ -375,7 +383,7 @@ func TestSetAttrPerm(t *testing.T) {
 		valid := p9.SetAttrMask{Permissions: true}
 		attr := p9.SetAttr{Permissions: 0777}
 		got, err := SetGetAttr(s.file, valid, attr)
-		if s.fileType == syscall.S_IFLNK {
+		if s.fileType == unix.S_IFLNK {
 			if err == nil {
 				t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions)
 			}
@@ -396,7 +404,7 @@ func TestSetAttrSize(t *testing.T) {
 			valid := p9.SetAttrMask{Size: true}
 			attr := p9.SetAttr{Size: size}
 			got, err := SetGetAttr(s.file, valid, attr)
-			if s.fileType == syscall.S_IFLNK || s.fileType == syscall.S_IFDIR {
+			if s.fileType == unix.S_IFLNK || s.fileType == unix.S_IFDIR {
 				if err == nil {
 					t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions)
 				}
@@ -478,9 +486,9 @@ func TestLink(t *testing.T) {
 		}
 
 		err = dir.Link(s.file, linkFile)
-		if s.fileType == syscall.S_IFDIR {
-			if err != syscall.EPERM {
-				t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err)
+		if s.fileType == unix.S_IFDIR {
+			if err != unix.EPERM {
+				t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: unix.EPERM", s, linkFile, err)
 			}
 			return
 		}
@@ -491,9 +499,12 @@ func TestLink(t *testing.T) {
 }
 
 func TestROMountChecks(t *testing.T) {
-	const want = syscall.EROFS
+	const want = unix.EROFS
+	uid := p9.UID(os.Getuid())
+	gid := p9.GID(os.Getgid())
+
 	runCustom(t, allTypes, roConfs, func(t *testing.T, s state) {
-		if s.fileType != syscall.S_IFLNK {
+		if s.fileType != unix.S_IFLNK {
 			if _, _, _, err := s.file.Open(p9.WriteOnly); err != want {
 				t.Errorf("Open() should have failed, got: %v, expected: %v", err, want)
 			}
@@ -512,16 +523,16 @@ func TestROMountChecks(t *testing.T) {
 			}
 		}
 
-		if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != want {
+		if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, uid, gid); err != want {
 			t.Errorf("Create() should have failed, got: %v, expected: %v", err, want)
 		}
-		if _, err := s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != want {
+		if _, err := s.file.Mkdir("some_dir", 0777, uid, gid); err != want {
 			t.Errorf("MkDir() should have failed, got: %v, expected: %v", err, want)
 		}
 		if err := s.file.RenameAt("some_file", s.file, "other_file"); err != want {
 			t.Errorf("Rename() should have failed, got: %v, expected: %v", err, want)
 		}
-		if _, err := s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != want {
+		if _, err := s.file.Symlink("some_place", "some_symlink", uid, gid); err != want {
 			t.Errorf("Symlink() should have failed, got: %v, expected: %v", err, want)
 		}
 		if err := s.file.UnlinkAt("some_file", 0); err != want {
@@ -530,6 +541,9 @@ func TestROMountChecks(t *testing.T) {
 		if err := s.file.Link(s.file, "some_link"); err != want {
 			t.Errorf("Link() should have failed, got: %v, expected: %v", err, want)
 		}
+		if _, err := s.file.Mknod("some-nod", 0777, 1, 2, uid, gid); err != want {
+			t.Errorf("Mknod() should have failed, got: %v, expected: %v", err, want)
+		}
 
 		valid := p9.SetAttrMask{Size: true}
 		attr := p9.SetAttr{Size: 0}
@@ -539,29 +553,10 @@ func TestROMountChecks(t *testing.T) {
 	})
 }
 
-func TestROMountPanics(t *testing.T) {
-	conf := Config{ROMount: true, PanicOnWrite: true}
-	runCustom(t, allTypes, []Config{conf}, func(t *testing.T, s state) {
-		if s.fileType != syscall.S_IFLNK {
-			assertPanic(t, func() { s.file.Open(p9.WriteOnly) })
-		}
-		assertPanic(t, func() { s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) })
-		assertPanic(t, func() { s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) })
-		assertPanic(t, func() { s.file.RenameAt("some_file", s.file, "other_file") })
-		assertPanic(t, func() { s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())) })
-		assertPanic(t, func() { s.file.UnlinkAt("some_file", 0) })
-		assertPanic(t, func() { s.file.Link(s.file, "some_link") })
-
-		valid := p9.SetAttrMask{Size: true}
-		attr := p9.SetAttr{Size: 0}
-		assertPanic(t, func() { s.file.SetAttr(valid, attr) })
-	})
-}
-
 func TestWalkNotFound(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFDIR}, allConfs, func(t *testing.T, s state) {
-		if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT {
-			t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", s, "nobody-here", err)
+	runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) {
+		if _, _, err := s.file.Walk([]string{"nobody-here"}); err != unix.ENOENT {
+			t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: unix.ENOENT", s, "nobody-here", err)
 		}
 	})
 }
@@ -580,7 +575,7 @@ func TestWalkDup(t *testing.T) {
 }
 
 func TestReaddir(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		name := "dir"
 		if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
 			t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err)
@@ -705,7 +700,7 @@ func TestAttachInvalidType(t *testing.T) {
 	defer os.RemoveAll(dir)
 
 	fifo := filepath.Join(dir, "fifo")
-	if err := syscall.Mkfifo(fifo, 0755); err != nil {
+	if err := unix.Mkfifo(fifo, 0755); err != nil {
 		t.Fatalf("Mkfifo(%q): %v", fifo, err)
 	}
 
@@ -766,16 +761,16 @@ func TestDoubleAttachError(t *testing.T) {
 }
 
 func TestTruncate(t *testing.T) {
-	runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		child, err := createFile(s.file, "test")
 		if err != nil {
-			t.Fatalf("createFile() failed, err: %v", err)
+			t.Fatalf("createFile() failed: %v", err)
 		}
 		defer child.Close()
 		want := []byte("foobar")
 		w, err := child.WriteAt(want, 0)
 		if err != nil {
-			t.Fatalf("Write() failed, err: %v", err)
+			t.Fatalf("Write() failed: %v", err)
 		}
 		if w != len(want) {
 			t.Fatalf("Write() was partial, got: %d, expected: %d", w, len(want))
@@ -783,12 +778,15 @@ func TestTruncate(t *testing.T) {
 
 		_, l, err := s.file.Walk([]string{"test"})
 		if err != nil {
-			t.Fatalf("Walk(%s) failed, err: %v", "test", err)
+			t.Fatalf("Walk(%s) failed: %v", "test", err)
 		}
 		if _, _, _, err := l.Open(p9.ReadOnly | p9.OpenTruncate); err != nil {
-			t.Fatalf("Open() failed, err: %v", err)
+			t.Fatalf("Open() failed: %v", err)
 		}
 		_, mask, attr, err := l.GetAttr(p9.AttrMask{Size: true})
+		if err != nil {
+			t.Fatalf("GetAttr() failed: %v", err)
+		}
 		if !mask.Size {
 			t.Fatalf("GetAttr() didn't return size: %+v", mask)
 		}
@@ -797,3 +795,27 @@ func TestTruncate(t *testing.T) {
 		}
 	})
 }
+
+func TestMknod(t *testing.T) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+		_, err := s.file.Mknod("test", p9.ModeRegular|0777, 1, 2, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+		if err != nil {
+			t.Fatalf("Mknod() failed: %v", err)
+		}
+
+		_, f, err := s.file.Walk([]string{"test"})
+		if err != nil {
+			t.Fatalf("Walk() failed: %v", err)
+		}
+		fd, _, _, err := f.Open(p9.ReadWrite)
+		if err != nil {
+			t.Fatalf("Open() failed: %v", err)
+		}
+		if fd != nil {
+			defer fd.Close()
+		}
+		if err := testReadWrite(f, p9.ReadWrite, nil); err != nil {
+			t.Fatalf("testReadWrite() failed: %v", err)
+		}
+	})
+}
diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go
index 542b54365..f11fea40d 100644
--- a/runsc/fsgofer/fsgofer_unsafe.go
+++ b/runsc/fsgofer/fsgofer_unsafe.go
@@ -15,18 +15,18 @@
 package fsgofer
 
 import (
-	"syscall"
 	"unsafe"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/syserr"
 )
 
-func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error {
+func utimensat(dirFd int, name string, times [2]unix.Timespec, flags int) error {
 	// utimensat(2) doesn't accept empty name, instead name must be nil to make it
 	// operate directly on 'dirFd' unlike other *at syscalls.
 	var namePtr unsafe.Pointer
 	if name != "" {
-		nameBytes, err := syscall.BytePtrFromString(name)
+		nameBytes, err := unix.BytePtrFromString(name)
 		if err != nil {
 			return err
 		}
@@ -35,8 +35,8 @@ func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) err
 
 	timesPtr := unsafe.Pointer(&times[0])
 
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_UTIMENSAT,
+	if _, _, errno := unix.Syscall6(
+		unix.SYS_UTIMENSAT,
 		uintptr(dirFd),
 		uintptr(namePtr),
 		uintptr(timesPtr),
@@ -52,7 +52,7 @@ func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) err
 func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error {
 	var oldNamePtr unsafe.Pointer
 	if oldName != "" {
-		nameBytes, err := syscall.BytePtrFromString(oldName)
+		nameBytes, err := unix.BytePtrFromString(oldName)
 		if err != nil {
 			return err
 		}
@@ -60,15 +60,15 @@ func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error
 	}
 	var newNamePtr unsafe.Pointer
 	if newName != "" {
-		nameBytes, err := syscall.BytePtrFromString(newName)
+		nameBytes, err := unix.BytePtrFromString(newName)
 		if err != nil {
 			return err
 		}
 		newNamePtr = unsafe.Pointer(nameBytes)
 	}
 
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_RENAMEAT,
+	if _, _, errno := unix.Syscall6(
+		unix.SYS_RENAMEAT,
 		uintptr(oldDirFD),
 		uintptr(oldNamePtr),
 		uintptr(newDirFD),
diff --git a/runsc/main.go b/runsc/main.go
index 69cb505fa..4ce5ebee9 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,363 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Binary runsc is an implementation of the Open Container Initiative Runtime
-// that runs applications inside a sandbox.
+// Binary runsc implements the OCI runtime interface.
 package main
 
 import (
-	"context"
-	"fmt"
-	"io"
-	"io/ioutil"
-	"os"
-	"os/signal"
-	"path/filepath"
-	"strings"
-	"syscall"
-	"time"
-
-	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/runsc/boot"
-	"gvisor.dev/gvisor/runsc/cmd"
-	"gvisor.dev/gvisor/runsc/flag"
-	"gvisor.dev/gvisor/runsc/specutils"
-)
-
-var (
-	// Although these flags are not part of the OCI spec, they are used by
-	// Docker, and thus should not be changed.
-	rootDir     = flag.String("root", "", "root directory for storage of container state.")
-	logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout.")
-	logFormat   = flag.String("log-format", "text", "log format: text (default), json, or json-k8s.")
-	debug       = flag.Bool("debug", false, "enable debug logging.")
-	showVersion = flag.Bool("version", false, "show version and exit.")
-	// TODO(gvisor.dev/issue/193): support systemd cgroups
-	systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.")
-
-	// These flags are unique to runsc, and are used to configure parts of the
-	// system that are not covered by the runtime spec.
-
-	// Debugging flags.
-	debugLog        = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
-	panicLog        = flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.")
-	logPackets      = flag.Bool("log-packets", false, "enable network packet logging.")
-	logFD           = flag.Int("log-fd", -1, "file descriptor to log to.  If set, the 'log' flag is ignored.")
-	debugLogFD      = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to.  If set, the 'debug-log-dir' flag is ignored.")
-	panicLogFD      = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.")
-	debugLogFormat  = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
-	alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr.")
-
-	// Debugging flags: strace related
-	strace         = flag.Bool("strace", false, "enable strace.")
-	straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
-	straceLogSize  = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.")
-
-	// Flags that control sandbox runtime behavior.
-	platformName       = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
-	network            = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
-	hardwareGSO        = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
-	softwareGSO        = flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.")
-	txChecksumOffload  = flag.Bool("tx-checksum-offload", false, "enable TX checksum offload.")
-	rxChecksumOffload  = flag.Bool("rx-checksum-offload", true, "enable RX checksum offload.")
-	qDisc              = flag.String("qdisc", "fifo", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.")
-	fileAccess         = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
-	fsGoferHostUDS     = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
-	overlay            = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
-	overlayfsStaleRead = flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem")
-	watchdogAction     = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
-	panicSignal        = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
-	profile            = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
-	netRaw             = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
-	numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
-	rootless           = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
-	referenceLeakMode  = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
-	cpuNumFromQuota    = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
-	vfs2Enabled        = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
-	fuseEnabled        = flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.")
-
-	// Test flags, not to be used outside tests, ever.
-	testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
-	testOnlyTestNameEnv                        = flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.")
+	"gvisor.dev/gvisor/runsc/cli"
 )
 
 func main() {
-	// Help and flags commands are generated automatically.
-	help := cmd.NewHelp(subcommands.DefaultCommander)
-	help.Register(new(cmd.Syscalls))
-	subcommands.Register(help, "")
-	subcommands.Register(subcommands.FlagsCommand(), "")
-
-	// Installation helpers.
-	const helperGroup = "helpers"
-	subcommands.Register(new(cmd.Install), helperGroup)
-	subcommands.Register(new(cmd.Uninstall), helperGroup)
-
-	// Register user-facing runsc commands.
-	subcommands.Register(new(cmd.Checkpoint), "")
-	subcommands.Register(new(cmd.Create), "")
-	subcommands.Register(new(cmd.Delete), "")
-	subcommands.Register(new(cmd.Do), "")
-	subcommands.Register(new(cmd.Events), "")
-	subcommands.Register(new(cmd.Exec), "")
-	subcommands.Register(new(cmd.Gofer), "")
-	subcommands.Register(new(cmd.Kill), "")
-	subcommands.Register(new(cmd.List), "")
-	subcommands.Register(new(cmd.Pause), "")
-	subcommands.Register(new(cmd.PS), "")
-	subcommands.Register(new(cmd.Restore), "")
-	subcommands.Register(new(cmd.Resume), "")
-	subcommands.Register(new(cmd.Run), "")
-	subcommands.Register(new(cmd.Spec), "")
-	subcommands.Register(new(cmd.State), "")
-	subcommands.Register(new(cmd.Start), "")
-	subcommands.Register(new(cmd.Wait), "")
-
-	// Register internal commands with the internal group name. This causes
-	// them to be sorted below the user-facing commands with empty group.
-	// The string below will be printed above the commands.
-	const internalGroup = "internal use only"
-	subcommands.Register(new(cmd.Boot), internalGroup)
-	subcommands.Register(new(cmd.Debug), internalGroup)
-	subcommands.Register(new(cmd.Gofer), internalGroup)
-	subcommands.Register(new(cmd.Statefile), internalGroup)
-
-	// All subcommands must be registered before flag parsing.
-	flag.Parse()
-
-	// Are we showing the version?
-	if *showVersion {
-		// The format here is the same as runc.
-		fmt.Fprintf(os.Stdout, "runsc version %s\n", version)
-		fmt.Fprintf(os.Stdout, "spec: %s\n", specutils.Version)
-		os.Exit(0)
-	}
-
-	// TODO(gvisor.dev/issue/193): support systemd cgroups
-	if *systemdCgroup {
-		fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193")
-		os.Exit(1)
-	}
-
-	var errorLogger io.Writer
-	if *logFD > -1 {
-		errorLogger = os.NewFile(uintptr(*logFD), "error log file")
-
-	} else if *logFilename != "" {
-		// We must set O_APPEND and not O_TRUNC because Docker passes
-		// the same log file for all commands (and also parses these
-		// log files), so we can't destroy them on each command.
-		var err error
-		errorLogger, err = os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
-		if err != nil {
-			cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
-		}
-	}
-	cmd.ErrorLogger = errorLogger
-
-	platformType := *platformName
-	if _, err := platform.Lookup(platformType); err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	fsAccess, err := boot.MakeFileAccessType(*fileAccess)
-	if err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	if fsAccess == boot.FileAccessShared && *overlay {
-		cmd.Fatalf("overlay flag is incompatible with shared file access")
-	}
-
-	netType, err := boot.MakeNetworkType(*network)
-	if err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	wa, err := boot.MakeWatchdogAction(*watchdogAction)
-	if err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	if *numNetworkChannels <= 0 {
-		cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels)
-	}
-
-	refsLeakMode, err := boot.MakeRefsLeakMode(*referenceLeakMode)
-	if err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	queueingDiscipline, err := boot.MakeQueueingDiscipline(*qDisc)
-	if err != nil {
-		cmd.Fatalf("%s", err)
-	}
-
-	// Sets the reference leak check mode. Also set it in config below to
-	// propagate it to child processes.
-	refs.SetLeakMode(refsLeakMode)
-
-	// Create a new Config from the flags.
-	conf := &boot.Config{
-		RootDir:            *rootDir,
-		Debug:              *debug,
-		LogFilename:        *logFilename,
-		LogFormat:          *logFormat,
-		DebugLog:           *debugLog,
-		PanicLog:           *panicLog,
-		DebugLogFormat:     *debugLogFormat,
-		FileAccess:         fsAccess,
-		FSGoferHostUDS:     *fsGoferHostUDS,
-		Overlay:            *overlay,
-		Network:            netType,
-		HardwareGSO:        *hardwareGSO,
-		SoftwareGSO:        *softwareGSO,
-		TXChecksumOffload:  *txChecksumOffload,
-		RXChecksumOffload:  *rxChecksumOffload,
-		LogPackets:         *logPackets,
-		Platform:           platformType,
-		Strace:             *strace,
-		StraceLogSize:      *straceLogSize,
-		WatchdogAction:     wa,
-		PanicSignal:        *panicSignal,
-		ProfileEnable:      *profile,
-		EnableRaw:          *netRaw,
-		NumNetworkChannels: *numNetworkChannels,
-		Rootless:           *rootless,
-		AlsoLogToStderr:    *alsoLogToStderr,
-		ReferenceLeakMode:  refsLeakMode,
-		OverlayfsStaleRead: *overlayfsStaleRead,
-		CPUNumFromQuota:    *cpuNumFromQuota,
-		VFS2:               *vfs2Enabled,
-		FUSE:               *fuseEnabled,
-		QDisc:              queueingDiscipline,
-		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
-		TestOnlyTestNameEnv:                        *testOnlyTestNameEnv,
-	}
-	if len(*straceSyscalls) != 0 {
-		conf.StraceSyscalls = strings.Split(*straceSyscalls, ",")
-	}
-
-	// Set up logging.
-	if *debug {
-		log.SetLevel(log.Debug)
-	}
-
-	// Logging will include the local date and time via the time package.
-	//
-	// On first use, time.Local initializes the local time zone, which
-	// involves opening tzdata files on the host. Since this requires
-	// opening host files, it must be done before syscall filter
-	// installation.
-	//
-	// Generally there will be a log message before filter installation
-	// that will force initialization, but force initialization here in
-	// case that does not occur.
-	_ = time.Local.String()
-
-	subcommand := flag.CommandLine.Arg(0)
-
-	var e log.Emitter
-	if *debugLogFD > -1 {
-		f := os.NewFile(uintptr(*debugLogFD), "debug log file")
-
-		e = newEmitter(*debugLogFormat, f)
-
-	} else if *debugLog != "" {
-		f, err := specutils.DebugLogFile(*debugLog, subcommand, "" /* name */)
-		if err != nil {
-			cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err)
-		}
-		e = newEmitter(*debugLogFormat, f)
-
-	} else {
-		// Stderr is reserved for the application, just discard the logs if no debug
-		// log is specified.
-		e = newEmitter("text", ioutil.Discard)
-	}
-
-	if *panicLogFD > -1 || *debugLogFD > -1 {
-		fd := *panicLogFD
-		if fd < 0 {
-			fd = *debugLogFD
-		}
-		// Quick sanity check to make sure no other commands get passed
-		// a log fd (they should use log dir instead).
-		if subcommand != "boot" && subcommand != "gofer" {
-			cmd.Fatalf("flags --debug-log-fd and --panic-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand)
-		}
-
-		// If we are the boot process, then we own our stdio FDs and can do what we
-		// want with them. Since Docker and Containerd both eat boot's stderr, we
-		// dup our stderr to the provided log FD so that panics will appear in the
-		// logs, rather than just disappear.
-		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil {
-			cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err)
-		}
-	} else if *alsoLogToStderr {
-		e = &log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
-	}
-
-	log.SetTarget(e)
-
-	log.Infof("***************************")
-	log.Infof("Args: %s", os.Args)
-	log.Infof("Version %s", version)
-	log.Infof("PID: %d", os.Getpid())
-	log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid())
-	log.Infof("Configuration:")
-	log.Infof("\t\tRootDir: %s", conf.RootDir)
-	log.Infof("\t\tPlatform: %v", conf.Platform)
-	log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay)
-	log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets)
-	log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
-	log.Infof("\t\tVFS2 enabled: %v", conf.VFS2)
-	log.Infof("***************************")
-
-	if *testOnlyAllowRunAsCurrentUserWithoutChroot {
-		// SIGTERM is sent to all processes if a test exceeds its
-		// timeout and this case is handled by syscall_test_runner.
-		log.Warningf("Block the TERM signal. This is only safe in tests!")
-		signal.Ignore(syscall.SIGTERM)
-	}
-
-	// Call the subcommand and pass in the configuration.
-	var ws syscall.WaitStatus
-	subcmdCode := subcommands.Execute(context.Background(), conf, &ws)
-	if subcmdCode == subcommands.ExitSuccess {
-		log.Infof("Exiting with status: %v", ws)
-		if ws.Signaled() {
-			// No good way to return it, emulate what the shell does. Maybe raise
-			// signal to self?
-			os.Exit(128 + int(ws.Signal()))
-		}
-		os.Exit(ws.ExitStatus())
-	}
-	// Return an error that is unlikely to be used by the application.
-	log.Warningf("Failure to execute command, err: %v", subcmdCode)
-	os.Exit(128)
-}
-
-func newEmitter(format string, logFile io.Writer) log.Emitter {
-	switch format {
-	case "text":
-		return log.GoogleEmitter{&log.Writer{Next: logFile}}
-	case "json":
-		return log.JSONEmitter{&log.Writer{Next: logFile}}
-	case "json-k8s":
-		return log.K8sJSONEmitter{&log.Writer{Next: logFile}}
-	}
-	cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format)
-	panic("unreachable")
-}
-
-func init() {
-	// Set default root dir to something (hopefully) user-writeable.
-	*rootDir = "/var/run/runsc"
-	if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" {
-		*rootDir = filepath.Join(runtimeDir, "runsc")
-	}
+	cli.Main(version)
 }
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index 2b9d4549d..f0a551a1e 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -26,6 +26,7 @@ go_library(
         "//runsc/boot",
         "//runsc/boot/platforms",
         "//runsc/cgroup",
+        "//runsc/config",
         "//runsc/console",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 817a923ad..8f66dd1f8 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -31,6 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -49,26 +50,26 @@ import (
 //
 // Run the following container to test it:
 //  docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4
-func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error {
+func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *config.Config) error {
 	log.Infof("Setting up network")
 
 	switch conf.Network {
-	case boot.NetworkNone:
+	case config.NetworkNone:
 		log.Infof("Network is disabled, create loopback interface only")
 		if err := createDefaultLoopbackInterface(conn); err != nil {
 			return fmt.Errorf("creating default loopback interface: %v", err)
 		}
-	case boot.NetworkSandbox:
+	case config.NetworkSandbox:
 		// Build the path to the net namespace of the sandbox process.
 		// This is what we will copy.
 		nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
 		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.TXChecksumOffload, conf.RXChecksumOffload, conf.NumNetworkChannels, conf.QDisc); err != nil {
 			return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
 		}
-	case boot.NetworkHost:
+	case config.NetworkHost:
 		// Nothing to do here.
 	default:
-		return fmt.Errorf("invalid network type: %d", conf.Network)
+		return fmt.Errorf("invalid network type: %v", conf.Network)
 	}
 	return nil
 }
@@ -115,7 +116,7 @@ func isRootNS() (bool, error) {
 // createInterfacesAndRoutesFromNS scrapes the interface and routes from the
 // net namespace with the given path, creates them in the sandbox, and removes
 // them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc config.QueueingDiscipline) error {
 	// Join the network namespace that we will be copying.
 	restore, err := joinNetNS(nsPath)
 	if err != nil {
@@ -308,11 +309,20 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (
 	const bufSize = 4 << 20 // 4MB.
 
 	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, bufSize); err != nil {
-		return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", bufSize, err)
+		syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, bufSize)
+		sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF)
+
+		if sz < bufSize {
+			log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err)
+		}
 	}
 
 	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUFFORCE, bufSize); err != nil {
-		return nil, fmt.Errorf("failed to increase socket snd buffer to %d: %v", bufSize, err)
+		syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, bufSize)
+		sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF)
+		if sz < bufSize {
+			log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Curent buffer %d: %v", bufSize, iface.Name, sz, err)
+		}
 	}
 
 	return &socketEntry{deviceFile, gsoMaxSize}, nil
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 36bb0c9c9..c4309feb3 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -41,6 +41,7 @@ import (
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
 	"gvisor.dev/gvisor/runsc/cgroup"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/console"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -71,11 +72,14 @@ type Sandbox struct {
 	// will have it as a child process.
 	child bool
 
-	// status is an exit status of a sandbox process.
-	status syscall.WaitStatus
-
 	// statusMu protects status.
 	statusMu sync.Mutex
+
+	// status is the exit status of a sandbox process. It's only set if the
+	// child==true and the sandbox was waited on. This field allows for multiple
+	// threads to wait on sandbox and get the exit code, since Linux will return
+	// WaitStatus to one of the waiters only.
+	status syscall.WaitStatus
 }
 
 // Args is used to configure a new sandbox.
@@ -116,7 +120,7 @@ type Args struct {
 
 // New creates the sandbox process. The caller must call Destroy() on the
 // sandbox.
-func New(conf *boot.Config, args *Args) (*Sandbox, error) {
+func New(conf *config.Config, args *Args) (*Sandbox, error) {
 	s := &Sandbox{ID: args.ID, Cgroup: args.Cgroup}
 	// The Cleanup object cleans up partially created sandboxes when an error
 	// occurs. Any errors occurring during cleanup itself are ignored.
@@ -180,7 +184,7 @@ func (s *Sandbox) CreateContainer(cid string) error {
 }
 
 // StartRoot starts running the root container process inside the sandbox.
-func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error {
+func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error {
 	log.Debugf("Start root sandbox %q, PID: %d", s.ID, s.Pid)
 	conn, err := s.sandboxConnect()
 	if err != nil {
@@ -203,7 +207,7 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error {
 }
 
 // StartContainer starts running a non-root container inside the sandbox.
-func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string, goferFiles []*os.File) error {
+func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, goferFiles []*os.File) error {
 	for _, f := range goferFiles {
 		defer f.Close()
 	}
@@ -232,7 +236,7 @@ func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string
 }
 
 // Restore sends the restore call for a container in the sandbox.
-func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, filename string) error {
+func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *config.Config, filename string) error {
 	log.Debugf("Restore sandbox %q", s.ID)
 
 	rf, err := os.Open(filename)
@@ -344,7 +348,7 @@ func (s *Sandbox) connError(err error) error {
 
 // createSandboxProcess starts the sandbox as a subprocess by running the "boot"
 // command, passing in the bundle dir.
-func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncFile *os.File) error {
+func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error {
 	// nextFD is used to get unused FDs that we can pass to the sandbox.  It
 	// starts at 3 because 0, 1, and 2 are taken by stdin/out/err.
 	nextFD := 3
@@ -477,10 +481,10 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 	cmd.Stderr = nil
 
 	// If the console control socket file is provided, then create a new
-	// pty master/slave pair and set the TTY on the sandbox process.
+	// pty master/replica pair and set the TTY on the sandbox process.
 	if args.Spec.Process.Terminal && args.ConsoleSocket != "" {
 		// console.NewWithSocket will send the master on the given
-		// socket, and return the slave.
+		// socket, and return the replica.
 		tty, err := console.NewWithSocket(args.ConsoleSocket)
 		if err != nil {
 			return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err)
@@ -555,10 +559,10 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 	// Joins the network namespace if network is enabled. the sandbox talks
 	// directly to the host network, which may have been configured in the
 	// namespace.
-	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != boot.NetworkNone {
+	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone {
 		log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
 		nss = append(nss, ns)
-	} else if conf.Network == boot.NetworkHost {
+	} else if conf.Network == config.NetworkHost {
 		log.Infof("Sandbox will be started in the host network namespace")
 	} else {
 		log.Infof("Sandbox will be started in new network namespace")
@@ -568,7 +572,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 	// User namespace depends on the network type. Host network requires to run
 	// inside the user namespace specified in the spec or the current namespace
 	// if none is configured.
-	if conf.Network == boot.NetworkHost {
+	if conf.Network == config.NetworkHost {
 		if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok {
 			log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
 			nss = append(nss, userns)
@@ -745,35 +749,47 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 // Wait waits for the containerized process to exit, and returns its WaitStatus.
 func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
 	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
-	var ws syscall.WaitStatus
 
 	if conn, err := s.sandboxConnect(); err != nil {
-		// The sandbox may have exited while before we had a chance to
-		// wait on it.
+		// The sandbox may have exited while before we had a chance to wait on it.
+		// There is nothing we can do for subcontainers. For the init container, we
+		// can try to get the sandbox exit code.
+		if !s.IsRootContainer(cid) {
+			return syscall.WaitStatus(0), err
+		}
 		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
 	} else {
 		defer conn.Close()
+
 		// Try the Wait RPC to the sandbox.
+		var ws syscall.WaitStatus
 		err = conn.Call(boot.ContainerWait, &cid, &ws)
 		if err == nil {
 			// It worked!
 			return ws, nil
 		}
+		// See comment above.
+		if !s.IsRootContainer(cid) {
+			return syscall.WaitStatus(0), err
+		}
+
 		// The sandbox may have exited after we connected, but before
 		// or during the Wait RPC.
 		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
 	}
 
-	// The sandbox may have already exited, or exited while handling the
-	// Wait RPC. The best we can do is ask Linux what the sandbox exit
-	// status was, since in most cases that will be the same as the
-	// container exit status.
+	// The sandbox may have already exited, or exited while handling the Wait RPC.
+	// The best we can do is ask Linux what the sandbox exit status was, since in
+	// most cases that will be the same as the container exit status.
 	if err := s.waitForStopped(); err != nil {
-		return ws, err
+		return syscall.WaitStatus(0), err
 	}
 	if !s.child {
-		return ws, fmt.Errorf("sandbox no longer running and its exit status is unavailable")
+		return syscall.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable")
 	}
+
+	s.statusMu.Lock()
+	defer s.statusMu.Unlock()
 	return s.status, nil
 }
 
@@ -1179,7 +1195,7 @@ func deviceFileForPlatform(name string) (*os.File, error) {
 
 // checkBinaryPermissions verifies that the required binary bits are set on
 // the runsc executable.
-func checkBinaryPermissions(conf *boot.Config) error {
+func checkBinaryPermissions(conf *config.Config) error {
 	// All platforms need the other exe bit
 	neededBits := os.FileMode(0001)
 	if conf.Platform == platforms.Ptrace {
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
index 43851a22f..679d8bc8e 100644
--- a/runsc/specutils/BUILD
+++ b/runsc/specutils/BUILD
@@ -16,6 +16,7 @@ go_library(
         "//pkg/bits",
         "//pkg/log",
         "//pkg/sentry/kernel/auth",
+        "//runsc/config",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_mohae_deepcopy//:go_default_library",
         "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
diff --git a/runsc/specutils/seccomp/BUILD b/runsc/specutils/seccomp/BUILD
new file mode 100644
index 000000000..3520f2d6d
--- /dev/null
+++ b/runsc/specutils/seccomp/BUILD
@@ -0,0 +1,34 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "seccomp",
+    srcs = [
+        "audit_amd64.go",
+        "audit_arm64.go",
+        "seccomp.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/bpf",
+        "//pkg/log",
+        "//pkg/seccomp",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/syscalls/linux",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+    ],
+)
+
+go_test(
+    name = "seccomp_test",
+    size = "small",
+    srcs = ["seccomp_test.go"],
+    library = ":seccomp",
+    deps = [
+        "//pkg/binary",
+        "//pkg/bpf",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+    ],
+)
diff --git a/runsc/specutils/seccomp/audit_amd64.go b/runsc/specutils/seccomp/audit_amd64.go
new file mode 100644
index 000000000..417cf4a7a
--- /dev/null
+++ b/runsc/specutils/seccomp/audit_amd64.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package seccomp
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+const (
+	nativeArchAuditNo = linux.AUDIT_ARCH_X86_64
+)
diff --git a/runsc/specutils/seccomp/audit_arm64.go b/runsc/specutils/seccomp/audit_arm64.go
new file mode 100644
index 000000000..b727ceff2
--- /dev/null
+++ b/runsc/specutils/seccomp/audit_arm64.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package seccomp
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+const (
+	nativeArchAuditNo = linux.AUDIT_ARCH_AARCH64
+)
diff --git a/runsc/specutils/seccomp/seccomp.go b/runsc/specutils/seccomp/seccomp.go
new file mode 100644
index 000000000..5932f7a41
--- /dev/null
+++ b/runsc/specutils/seccomp/seccomp.go
@@ -0,0 +1,229 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package seccomp implements some features of libseccomp in order to support
+// OCI.
+package seccomp
+
+import (
+	"fmt"
+	"syscall"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bpf"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/seccomp"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+)
+
+var (
+	killThreadAction = linux.SECCOMP_RET_KILL_THREAD
+	trapAction       = linux.SECCOMP_RET_TRAP
+	// runc always returns EPERM as the errorcode for SECCOMP_RET_ERRNO
+	errnoAction = linux.SECCOMP_RET_ERRNO.WithReturnCode(uint16(syscall.EPERM))
+	// runc always returns EPERM as the errorcode for SECCOMP_RET_TRACE
+	traceAction = linux.SECCOMP_RET_TRACE.WithReturnCode(uint16(syscall.EPERM))
+	allowAction = linux.SECCOMP_RET_ALLOW
+)
+
+// BuildProgram generates a bpf program based on the given OCI seccomp
+// config.
+func BuildProgram(s *specs.LinuxSeccomp) (bpf.Program, error) {
+	defaultAction, err := convertAction(s.DefaultAction)
+	if err != nil {
+		return bpf.Program{}, fmt.Errorf("secomp default action: %w", err)
+	}
+	ruleset, err := convertRules(s)
+	if err != nil {
+		return bpf.Program{}, fmt.Errorf("invalid seccomp rules: %w", err)
+	}
+
+	instrs, err := seccomp.BuildProgram(ruleset, defaultAction, killThreadAction)
+	if err != nil {
+		return bpf.Program{}, fmt.Errorf("building seccomp program: %w", err)
+	}
+
+	program, err := bpf.Compile(instrs)
+	if err != nil {
+		return bpf.Program{}, fmt.Errorf("compiling seccomp program: %w", err)
+	}
+
+	return program, nil
+}
+
+// lookupSyscallNo gets the syscall number for the syscall with the given name
+// for the given architecture.
+func lookupSyscallNo(arch uint32, name string) (uint32, error) {
+	var table *kernel.SyscallTable
+	switch arch {
+	case linux.AUDIT_ARCH_X86_64:
+		table = slinux.AMD64
+	case linux.AUDIT_ARCH_AARCH64:
+		table = slinux.ARM64
+	}
+	if table == nil {
+		return 0, fmt.Errorf("unsupported architecture: %d", arch)
+	}
+	n, err := table.LookupNo(name)
+	if err != nil {
+		return 0, err
+	}
+	return uint32(n), nil
+}
+
+// convertAction converts a LinuxSeccompAction to BPFAction
+func convertAction(act specs.LinuxSeccompAction) (linux.BPFAction, error) {
+	// TODO(gvisor.dev/issue/3124): Update specs package to include ActLog and ActKillProcess.
+	switch act {
+	case specs.ActKill:
+		return killThreadAction, nil
+	case specs.ActTrap:
+		return trapAction, nil
+	case specs.ActErrno:
+		return errnoAction, nil
+	case specs.ActTrace:
+		return traceAction, nil
+	case specs.ActAllow:
+		return allowAction, nil
+	default:
+		return 0, fmt.Errorf("invalid action: %v", act)
+	}
+}
+
+// convertRules converts OCI linux seccomp rules into RuleSets that can be used by
+// the seccomp package to build a seccomp program.
+func convertRules(s *specs.LinuxSeccomp) ([]seccomp.RuleSet, error) {
+	// NOTE: Architectures are only really relevant when calling 32bit syscalls
+	// on a 64bit system. Since we don't support that in gVisor anyway, we
+	// ignore Architectures and only test against the native architecture.
+
+	ruleset := []seccomp.RuleSet{}
+
+	for _, syscall := range s.Syscalls {
+		sysRules := seccomp.NewSyscallRules()
+
+		action, err := convertAction(syscall.Action)
+		if err != nil {
+			return nil, err
+		}
+
+		// Args
+		rules, err := convertArgs(syscall.Args)
+		if err != nil {
+			return nil, err
+		}
+
+		for _, name := range syscall.Names {
+			syscallNo, err := lookupSyscallNo(nativeArchAuditNo, name)
+			if err != nil {
+				// If there is an error looking up the syscall number, assume it is
+				// not supported on this architecture and ignore it. This is, for
+				// better or worse, what runc does.
+				log.Warningf("OCI seccomp: ignoring syscall %q", name)
+				continue
+			}
+
+			for _, rule := range rules {
+				sysRules.AddRule(uintptr(syscallNo), rule)
+			}
+		}
+
+		ruleset = append(ruleset, seccomp.RuleSet{
+			Rules:  sysRules,
+			Action: action,
+		})
+	}
+
+	return ruleset, nil
+}
+
+// convertArgs converts an OCI seccomp argument rule to a list of seccomp.Rule.
+func convertArgs(args []specs.LinuxSeccompArg) ([]seccomp.Rule, error) {
+	argCounts := make([]uint, 6)
+
+	for _, arg := range args {
+		if arg.Index > 6 {
+			return nil, fmt.Errorf("invalid index: %d", arg.Index)
+		}
+
+		argCounts[arg.Index]++
+	}
+
+	// NOTE: If multiple rules apply to the same argument (same index) the
+	// action is triggered if any one of the rules matches (OR). If not, then
+	// all rules much match in order to trigger the action (AND). This appears to
+	// be some kind of legacy behavior of runc that nevertheless needs to be
+	// supported to maintain compatibility.
+
+	hasMultipleArgs := false
+	for _, count := range argCounts {
+		if count > 1 {
+			hasMultipleArgs = true
+			break
+		}
+	}
+
+	if hasMultipleArgs {
+		rules := []seccomp.Rule{}
+
+		// Old runc behavior - do this for compatibility.
+		// Add rules as ORs by adding separate Rules.
+		for _, arg := range args {
+			rule := seccomp.Rule{nil, nil, nil, nil, nil, nil}
+
+			if err := convertRule(arg, &rule); err != nil {
+				return nil, err
+			}
+
+			rules = append(rules, rule)
+		}
+
+		return rules, nil
+	}
+
+	// Add rules as ANDs by adding to the same Rule.
+	rule := seccomp.Rule{nil, nil, nil, nil, nil, nil}
+	for _, arg := range args {
+		if err := convertRule(arg, &rule); err != nil {
+			return nil, err
+		}
+	}
+
+	return []seccomp.Rule{rule}, nil
+}
+
+// convertRule converts and adds the arg to a rule.
+func convertRule(arg specs.LinuxSeccompArg, rule *seccomp.Rule) error {
+	switch arg.Op {
+	case specs.OpEqualTo:
+		rule[arg.Index] = seccomp.EqualTo(arg.Value)
+	case specs.OpNotEqual:
+		rule[arg.Index] = seccomp.NotEqual(arg.Value)
+	case specs.OpGreaterThan:
+		rule[arg.Index] = seccomp.GreaterThan(arg.Value)
+	case specs.OpGreaterEqual:
+		rule[arg.Index] = seccomp.GreaterThanOrEqual(arg.Value)
+	case specs.OpLessThan:
+		rule[arg.Index] = seccomp.LessThan(arg.Value)
+	case specs.OpLessEqual:
+		rule[arg.Index] = seccomp.LessThanOrEqual(arg.Value)
+	case specs.OpMaskedEqual:
+		rule[arg.Index] = seccomp.MaskedEqual(uintptr(arg.Value), uintptr(arg.ValueTwo))
+	default:
+		return fmt.Errorf("unsupported operand: %q", arg.Op)
+	}
+	return nil
+}
diff --git a/runsc/specutils/seccomp/seccomp_test.go b/runsc/specutils/seccomp/seccomp_test.go
new file mode 100644
index 000000000..850c237ba
--- /dev/null
+++ b/runsc/specutils/seccomp/seccomp_test.go
@@ -0,0 +1,414 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package seccomp
+
+import (
+	"fmt"
+	"syscall"
+	"testing"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/bpf"
+)
+
+type seccompData struct {
+	nr                 uint32
+	arch               uint32
+	instructionPointer uint64
+	args               [6]uint64
+}
+
+// asInput converts a seccompData to a bpf.Input.
+func asInput(d seccompData) bpf.Input {
+	return bpf.InputBytes{binary.Marshal(nil, binary.LittleEndian, d), binary.LittleEndian}
+}
+
+// testInput creates an Input struct with given seccomp input values.
+func testInput(arch uint32, syscallName string, args *[6]uint64) bpf.Input {
+	syscallNo, err := lookupSyscallNo(arch, syscallName)
+	if err != nil {
+		// Assume tests set valid syscall names.
+		panic(err)
+	}
+
+	if args == nil {
+		argArray := [6]uint64{0, 0, 0, 0, 0, 0}
+		args = &argArray
+	}
+
+	data := seccompData{
+		nr:   syscallNo,
+		arch: arch,
+		args: *args,
+	}
+
+	return asInput(data)
+}
+
+// testCase holds a seccomp test case.
+type testCase struct {
+	name     string
+	config   specs.LinuxSeccomp
+	input    bpf.Input
+	expected uint32
+}
+
+var (
+	// seccompTests is a list of speccomp test cases.
+	seccompTests = []testCase{
+		{
+			name: "default_allow",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+			},
+			input:    testInput(nativeArchAuditNo, "read", nil),
+			expected: uint32(allowAction),
+		},
+		{
+			name: "default_deny",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActErrno,
+			},
+			input:    testInput(nativeArchAuditNo, "read", nil),
+			expected: uint32(errnoAction),
+		},
+		{
+			name: "deny_arch",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"getcwd",
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			// Syscall matches but the arch is AUDIT_ARCH_X86 so the return
+			// value is the bad arch action.
+			input:    asInput(seccompData{nr: 183, arch: 0x40000003}), //
+			expected: uint32(killThreadAction),
+		},
+		{
+			name: "match_name_errno",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"getcwd",
+							"chmod",
+						},
+						Action: specs.ActErrno,
+					},
+					{
+						Names: []string{
+							"write",
+						},
+						Action: specs.ActTrace,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "getcwd", nil),
+			expected: uint32(errnoAction),
+		},
+		{
+			name: "match_name_trace",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"getcwd",
+							"chmod",
+						},
+						Action: specs.ActErrno,
+					},
+					{
+						Names: []string{
+							"write",
+						},
+						Action: specs.ActTrace,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "write", nil),
+			expected: uint32(traceAction),
+		},
+		{
+			name: "no_match_name_allow",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"getcwd",
+							"chmod",
+						},
+						Action: specs.ActErrno,
+					},
+					{
+						Names: []string{
+							"write",
+						},
+						Action: specs.ActTrace,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "openat", nil),
+			expected: uint32(allowAction),
+		},
+		{
+			name: "simple_match_args",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"clone",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index: 0,
+								Value: syscall.CLONE_FS,
+								Op:    specs.OpEqualTo,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}),
+			expected: uint32(errnoAction),
+		},
+		{
+			name: "match_args_or",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"clone",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index: 0,
+								Value: syscall.CLONE_FS,
+								Op:    specs.OpEqualTo,
+							},
+							{
+								Index: 0,
+								Value: syscall.CLONE_VM,
+								Op:    specs.OpEqualTo,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}),
+			expected: uint32(errnoAction),
+		},
+		{
+			name: "match_args_and",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"getsockopt",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index: 1,
+								Value: syscall.SOL_SOCKET,
+								Op:    specs.OpEqualTo,
+							},
+							{
+								Index: 2,
+								Value: syscall.SO_PEERCRED,
+								Op:    specs.OpEqualTo,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "getsockopt", &[6]uint64{0, syscall.SOL_SOCKET, syscall.SO_PEERCRED}),
+			expected: uint32(errnoAction),
+		},
+		{
+			name: "no_match_args_and",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"getsockopt",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index: 1,
+								Value: syscall.SOL_SOCKET,
+								Op:    specs.OpEqualTo,
+							},
+							{
+								Index: 2,
+								Value: syscall.SO_PEERCRED,
+								Op:    specs.OpEqualTo,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "getsockopt", &[6]uint64{0, syscall.SOL_SOCKET}),
+			expected: uint32(allowAction),
+		},
+		{
+			name: "Simple args (no match)",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"clone",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index: 0,
+								Value: syscall.CLONE_FS,
+								Op:    specs.OpEqualTo,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_VM}),
+			expected: uint32(allowAction),
+		},
+		{
+			name: "OpMaskedEqual (match)",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"clone",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index:    0,
+								Value:    syscall.CLONE_FS,
+								ValueTwo: syscall.CLONE_FS,
+								Op:       specs.OpMaskedEqual,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS | syscall.CLONE_VM}),
+			expected: uint32(errnoAction),
+		},
+		{
+			name: "OpMaskedEqual (no match)",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"clone",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index:    0,
+								Value:    syscall.CLONE_FS | syscall.CLONE_VM,
+								ValueTwo: syscall.CLONE_FS | syscall.CLONE_VM,
+								Op:       specs.OpMaskedEqual,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}),
+			expected: uint32(allowAction),
+		},
+		{
+			name: "OpMaskedEqual (clone)",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActErrno,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"clone",
+						},
+						// This comes from the Docker default seccomp
+						// profile for clone.
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index:    0,
+								Value:    0x7e020000,
+								ValueTwo: 0x0,
+								Op:       specs.OpMaskedEqual,
+							},
+						},
+						Action: specs.ActAllow,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "clone", &[6]uint64{0x50f00}),
+			expected: uint32(allowAction),
+		},
+	}
+)
+
+// TestRunscSeccomp generates seccomp programs from OCI config and executes
+// them using runsc's library, comparing against expected results.
+func TestRunscSeccomp(t *testing.T) {
+	for _, tc := range seccompTests {
+		t.Run(tc.name, func(t *testing.T) {
+			runscProgram, err := BuildProgram(&tc.config)
+			if err != nil {
+				t.Fatalf("generating runsc BPF: %v", err)
+			}
+
+			if err := checkProgram(runscProgram, tc.input, tc.expected); err != nil {
+				t.Fatalf("running runsc BPF: %v", err)
+			}
+		})
+	}
+}
+
+// checkProgram runs the given program over the given input and checks the
+// result against the expected output.
+func checkProgram(p bpf.Program, in bpf.Input, expected uint32) error {
+	result, err := bpf.Exec(p, in)
+	if err != nil {
+		return err
+	}
+
+	if result != expected {
+		// Include a decoded version of the program in output for debugging purposes.
+		decoded, _ := bpf.DecodeProgram(p)
+		return fmt.Errorf("Unexpected result: got: %d, expected: %d\nBPF Program\n%s", result, expected, decoded)
+	}
+
+	return nil
+}
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 88b97f139..45abc1425 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -35,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 // ExePath must point to runsc binary, which is normally the same binary. It's
@@ -110,11 +111,6 @@ func ValidateSpec(spec *specs.Spec) error {
 		log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.")
 	}
 
-	// TODO(gvisor.dev/issue/510): Apply seccomp to application inside sandbox.
-	if spec.Linux != nil && spec.Linux.Seccomp != nil {
-		log.Warningf("Seccomp spec is being ignored")
-	}
-
 	if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
 		if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil {
 			return err
@@ -161,18 +157,18 @@ func OpenSpec(bundleDir string) (*os.File, error) {
 // ReadSpec reads an OCI runtime spec from the given bundle directory.
 // ReadSpec also normalizes all potential relative paths into absolute
 // path, e.g. spec.Root.Path, mount.Source.
-func ReadSpec(bundleDir string) (*specs.Spec, error) {
+func ReadSpec(bundleDir string, conf *config.Config) (*specs.Spec, error) {
 	specFile, err := OpenSpec(bundleDir)
 	if err != nil {
 		return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err)
 	}
 	defer specFile.Close()
-	return ReadSpecFromFile(bundleDir, specFile)
+	return ReadSpecFromFile(bundleDir, specFile, conf)
 }
 
 // ReadSpecFromFile reads an OCI runtime spec from the given File, and
 // normalizes all relative paths into absolute by prepending the bundle dir.
-func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) {
+func ReadSpecFromFile(bundleDir string, specFile *os.File, conf *config.Config) (*specs.Spec, error) {
 	if _, err := specFile.Seek(0, os.SEEK_SET); err != nil {
 		return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err)
 	}
@@ -195,6 +191,20 @@ func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error)
 			m.Source = absPath(bundleDir, m.Source)
 		}
 	}
+
+	// Override flags using annotation to allow customization per sandbox
+	// instance.
+	for annotation, val := range spec.Annotations {
+		const flagPrefix = "dev.gvisor.flag."
+		if strings.HasPrefix(annotation, flagPrefix) {
+			name := annotation[len(flagPrefix):]
+			log.Infof("Overriding flag: %s=%q", name, val)
+			if err := conf.Override(name, val); err != nil {
+				return nil, err
+			}
+		}
+	}
+
 	return &spec, nil
 }
 
@@ -409,7 +419,7 @@ func Mount(src, dst, typ string, flags uint32) error {
 		// Special case, as there is no source directory for proc mounts.
 		isDir = true
 	} else if fi, err := os.Stat(src); err != nil {
-		return fmt.Errorf("Stat(%q) failed: %v", src, err)
+		return fmt.Errorf("stat(%q) failed: %v", src, err)
 	} else {
 		isDir = fi.IsDir()
 	}
@@ -417,25 +427,25 @@ func Mount(src, dst, typ string, flags uint32) error {
 	if isDir {
 		// Create the destination directory.
 		if err := os.MkdirAll(dst, 0777); err != nil {
-			return fmt.Errorf("Mkdir(%q) failed: %v", dst, err)
+			return fmt.Errorf("mkdir(%q) failed: %v", dst, err)
 		}
 	} else {
 		// Create the parent destination directory.
 		parent := path.Dir(dst)
 		if err := os.MkdirAll(parent, 0777); err != nil {
-			return fmt.Errorf("Mkdir(%q) failed: %v", parent, err)
+			return fmt.Errorf("mkdir(%q) failed: %v", parent, err)
 		}
 		// Create the destination file if it does not exist.
 		f, err := os.OpenFile(dst, syscall.O_CREAT, 0777)
 		if err != nil {
-			return fmt.Errorf("Open(%q) failed: %v", dst, err)
+			return fmt.Errorf("open(%q) failed: %v", dst, err)
 		}
 		f.Close()
 	}
 
 	// Do the mount.
 	if err := syscall.Mount(src, dst, typ, uintptr(flags), ""); err != nil {
-		return fmt.Errorf("Mount(%q, %q, %d) failed: %v", src, dst, flags, err)
+		return fmt.Errorf("mount(%q, %q, %d) failed: %v", src, dst, flags, err)
 	}
 	return nil
 }
diff --git a/scripts/common.sh b/scripts/common.sh
deleted file mode 100755
index 3ca699e4a..000000000
--- a/scripts/common.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -xeou pipefail
-
-# Get the path to the directory this script lives in.
-# If this script is being called with `source`, $0 will be the path of the
-# *sourcing* script, so we can't use `dirname $0` to find scripts in this
-# directory.
-if [[ -v BASH_SOURCE && "$0" != "$BASH_SOURCE" ]]; then
-  declare -r script_dir="$(dirname "$BASH_SOURCE")"
-else
-  declare -r script_dir="$(dirname "$0")"
-fi
-
-source "${script_dir}/common_build.sh"
-
-# Ensure it attempts to collect logs in all cases.
-trap collect_logs EXIT
-
-function set_runtime() {
-  RUNTIME=${1:-runsc}
-  RUNSC_BIN=/tmp/"${RUNTIME}"/runsc
-  RUNSC_LOGS_DIR="$(dirname ${RUNSC_BIN})"/logs
-  RUNSC_LOGS="${RUNSC_LOGS_DIR}"/runsc.log.%TEST%.%TIMESTAMP%.%COMMAND%
-}
-
-function test_runsc() {
-  test --test_arg=--runtime=${RUNTIME} "$@"
-}
-
-function install_runsc_for_test() {
-  local -r test_name=$1
-  shift
-  if [[ -z "${test_name}" ]]; then
-    echo "Missing mandatory test name"
-    exit 1
-  fi
-
-  # Add test to the name, so it doesn't conflict with other runtimes.
-  set_runtime $(find_branch_name)_"${test_name}"
-
-  # ${RUNSC_TEST_NAME} is set by tests (see dockerutil) to pass the test name
-  # down to the runtime.
-  install_runsc "${RUNTIME}" \
-      --TESTONLY-test-name-env=RUNSC_TEST_NAME \
-      --debug \
-      --strace \
-      --log-packets \
-      "$@"
-}
-
-# Installs the runsc with given runtime name. set_runtime must have been called
-# to set runtime and logs location.
-function install_runsc() {
-  local -r runtime=$1
-  shift
-
-  # Prepare the runtime binary.
-  local -r output=$(build //runsc)
-  mkdir -p "$(dirname ${RUNSC_BIN})"
-  cp -f "${output}" "${RUNSC_BIN}"
-  chmod 0755 "${RUNSC_BIN}"
-
-  # Install the runtime.
-  sudo "${RUNSC_BIN}" install --experimental=true --runtime="${runtime}" -- --debug-log "${RUNSC_LOGS}" "$@"
-
-  # Clear old logs files that may exist.
-  sudo rm -f "${RUNSC_LOGS_DIR}"/'*'
-
-  # Restart docker to pick up the new runtime configuration.
-  sudo systemctl restart docker
-}
diff --git a/scripts/common_build.sh b/scripts/common_build.sh
deleted file mode 100755
index d4a6c4908..000000000
--- a/scripts/common_build.sh
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-which bazel
-bazel version
-
-# Switch into the workspace; only necessary if run with kokoro.
-if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then
-  cd git/repo
-elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then
-  cd github/repo
-fi
-
-# Set the standard bazel flags.
-declare -a BAZEL_FLAGS=(
-  "--show_timestamps"
-  "--test_output=errors"
-  "--keep_going"
-  "--verbose_failures=true"
-)
-# If running via kokoro, use the remote config.
-if [[ -v KOKORO_ARTIFACTS_DIR ]]; then
-  BAZEL_FLAGS+=(
-    "--config=remote"
-  )
-fi
-declare -r BAZEL_FLAGS
-
-# Wrap bazel.
-function build() {
-  bazel build "${BAZEL_FLAGS[@]}" "$@" 2>&1 \
-    | tee /dev/fd/2 \
-    | grep -E '^  bazel-bin/' \
-    | awk '{ print $1; }'
-}
-
-function test() {
-  bazel test "${BAZEL_FLAGS[@]}" "$@"
-}
-
-function run() {
-  local binary=$1
-  shift
-  bazel run "${binary}" -- "$@"
-}
-
-function run_as_root() {
-  local binary=$1
-  shift
-  bazel run --run_under="sudo" "${binary}" -- "$@"
-}
-
-function query() {
- bazel query "$@"
-}
-
-function collect_logs() {
-  # Zip out everything into a convenient form.
-  if [[ -v KOKORO_ARTIFACTS_DIR ]] && [[ -e bazel-testlogs ]]; then
-    # Merge results files of all shards for each test suite.
-    for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do
-      junitparser merge `find $d -name test.xml` $d/test.xml
-      cat $d/shard_*_of_*/test.log > $d/test.log
-      if ls -ld $d/shard_*_of_*/test.outputs 2>/dev/null; then
-        zip -r -1 "$d/outputs.zip" $d/shard_*_of_*/test.outputs
-      fi
-    done
-    find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf
-    # Move test logs to Kokoro directory. tar is used to conveniently perform
-    # renames while moving files.
-    find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" |
-      tar --create --files-from - --transform 's/test\./sponge_log./' |
-      tar --extract --directory ${KOKORO_ARTIFACTS_DIR}
-
-    # Collect sentry logs, if any.
-    if [[ -v RUNSC_LOGS_DIR ]] && [[ -d "${RUNSC_LOGS_DIR}" ]]; then
-      # Check if the directory is empty or not (only the first line it needed).
-      local -r logs=$(ls "${RUNSC_LOGS_DIR}" | head -n1)
-      if [[ "${logs}" ]]; then
-        local -r archive=runsc_logs_"${RUNTIME}".tar.gz
-        if [[ -v KOKORO_BUILD_ARTIFACTS_SUBDIR ]]; then
-          echo "runsc logs will be uploaded to:"
-          echo "    gsutil cp gs://gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive} /tmp"
-          echo "    https://storage.cloud.google.com/gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive}"
-        fi
-        time tar \
-          --verbose \
-          --create \
-          --gzip \
-          --file="${KOKORO_ARTIFACTS_DIR}/${archive}" \
-          --directory "${RUNSC_LOGS_DIR}" \
-          .
-      fi
-    fi
-  fi
-}
-
-function find_branch_name() {
-  git branch --show-current \
-    || git rev-parse HEAD \
-    || bazel info workspace \
-    | xargs basename
-}
diff --git a/scripts/dev.sh b/scripts/dev.sh
deleted file mode 100755
index a9107f33e..000000000
--- a/scripts/dev.sh
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-# common.sh sets '-x', but it's annoying to see so much output.
-set +x
-
-# Defaults
-declare -i REFRESH=0
-declare NAME=$(find_branch_name)
-
-while [[ $# -gt 0 ]]; do
-  case "$1" in
-    --refresh)
-      REFRESH=1
-      ;;
-    --help)
-      echo "Use this script to build and install runsc with Docker."
-      echo
-      echo "usage: $0 [--refresh] [runtime_name]"
-      exit 1
-      ;;
-    *)
-      NAME=$1
-      ;;
-  esac
-  shift
-done
-
-set_runtime "${NAME}"
-echo
-echo "Using runtime=${RUNTIME}"
-echo
-
-echo Building runsc...
-# Build first and fail on error. $() prevents "set -e" from reporting errors.
-build //runsc
-declare OUTPUT="$(build //runsc)"
-
-if [[ ${REFRESH} -eq 0 ]]; then
-  install_runsc "${RUNTIME}"   --net-raw
-  install_runsc "${RUNTIME}-d" --net-raw --debug --strace --log-packets
-  install_runsc "${RUNTIME}-p" --net-raw --profile
-
-  echo
-  echo "Runtimes ${RUNTIME}, ${RUNTIME}-d (debug enabled), and ${RUNTIME}-p installed."
-  echo "Use --runtime="${RUNTIME}" with your Docker command."
-  echo "  docker run --rm --runtime="${RUNTIME}" hello-world"
-  echo
-  echo "If you rebuild, use $0 --refresh."
-
-else
-  mkdir -p "$(dirname ${RUNSC_BIN})"
-  cp -f ${OUTPUT} "${RUNSC_BIN}"
-  chmod a+rx "${RUNSC_BIN}"
-
-  echo
-  echo "Runtime ${RUNTIME} refreshed."
-fi
-
-echo "Logs are in: ${RUNSC_LOGS_DIR}"
diff --git a/scripts/do_tests.sh b/scripts/do_tests.sh
deleted file mode 100755
index a3a387c37..000000000
--- a/scripts/do_tests.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-# Build runsc.
-build //runsc
-
-# run runsc do without root privileges.
-run //runsc --rootless do true
-run //runsc --rootless --network=none do true
-
-# run runsc do with root privileges.
-run_as_root //runsc do true
diff --git a/scripts/docker_tests.sh b/scripts/docker_tests.sh
deleted file mode 100755
index 4f3867d05..000000000
--- a/scripts/docker_tests.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-make load-all-images
-
-install_runsc_for_test docker
-test_runsc //test/image:image_test //test/e2e:integration_test
-
-install_runsc_for_test docker --vfs2
-test_runsc //test/e2e:integration_test //test/image:image_test
diff --git a/scripts/go.sh b/scripts/go.sh
deleted file mode 100755
index 626ed8fa4..000000000
--- a/scripts/go.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-# Build the go path.
-build :gopath
-
-# Build the synthetic branch.
-tools/go_branch.sh
-
-# Checkout the new branch.
-git checkout go && git clean -f
-
-go version
-
-# Build everything.
-go build ./...
-
-# Push, if required.
-if [[ -v KOKORO_GO_PUSH ]] && [[ "${KOKORO_GO_PUSH}" == "true" ]]; then
-  if [[ -v KOKORO_GITHUB_ACCESS_TOKEN ]]; then
-    git config --global credential.helper cache
-    git credential approve <<EOF
-protocol=https
-host=github.com
-username=$(cat "${KOKORO_KEYSTORE_DIR}/${KOKORO_GITHUB_ACCESS_TOKEN}")
-password=x-oauth-basic
-EOF
-  fi
-  git push origin go:go
-fi
diff --git a/scripts/hostnet_tests.sh b/scripts/hostnet_tests.sh
deleted file mode 100755
index 992db50dd..000000000
--- a/scripts/hostnet_tests.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-make load-all-images
-
-# Install the runtime and perform basic tests.
-install_runsc_for_test hostnet --network=host
-test_runsc --test_arg=-checkpoint=false //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/iptables_tests.sh b/scripts/iptables_tests.sh
deleted file mode 100755
index 8299a7c8b..000000000
--- a/scripts/iptables_tests.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-make load-iptables
-
-# Needed by ip6tables.
-sudo modprobe ip6table_filter
-
-install_runsc_for_test iptables --net-raw
-test //test/iptables:iptables_test "--test_arg=--runtime=runc"
-test //test/iptables:iptables_test "--test_arg=--runtime=${RUNTIME}"
diff --git a/scripts/kvm_tests.sh b/scripts/kvm_tests.sh
deleted file mode 100755
index 619571c74..000000000
--- a/scripts/kvm_tests.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-make load-all-images
-
-# Ensure that KVM is loaded, and we can use it.
-(lsmod | grep -E '^(kvm_intel|kvm_amd)') || sudo modprobe kvm
-sudo chmod a+rw /dev/kvm
-
-# Run all KVM platform tests (locally).
-run_as_root //pkg/sentry/platform/kvm:kvm_test
-
-# Install the KVM runtime and run all integration tests.
-install_runsc_for_test kvm --platform=kvm
-test_runsc //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/make_tests.sh b/scripts/make_tests.sh
deleted file mode 100755
index dbf1bba77..000000000
--- a/scripts/make_tests.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-make runsc
-make bazel-shutdown
diff --git a/scripts/overlay_tests.sh b/scripts/overlay_tests.sh
deleted file mode 100755
index 448864953..000000000
--- a/scripts/overlay_tests.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-make load-all-images
-
-# Install the runtime and perform basic tests.
-install_runsc_for_test overlay --overlay
-test_runsc //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/packetdrill_tests.sh b/scripts/packetdrill_tests.sh
deleted file mode 100755
index 1a8181ac8..000000000
--- a/scripts/packetdrill_tests.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-make load-packetdrill
-
-install_runsc_for_test runsc-d
-QUERY_RESULT=$(query "attr(tags, manual, tests(//test/packetdrill/...))")
-test_runsc $QUERY_RESULT
diff --git a/scripts/packetimpact_tests.sh b/scripts/packetimpact_tests.sh
deleted file mode 100755
index 77fb84bc3..000000000
--- a/scripts/packetimpact_tests.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-make load-packetimpact
-
-install_runsc_for_test runsc-d
-QUERY_RESULT=$(query "attr(tags, packetimpact, tests(//test/packetimpact/...))")
-test_runsc $QUERY_RESULT
diff --git a/scripts/root_tests.sh b/scripts/root_tests.sh
deleted file mode 100755
index 3eb735e62..000000000
--- a/scripts/root_tests.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-make load-all-images
-CONTAINERD_VERSION=1.3.4 make sudo TARGETS="tools/installers:containerd"
-make sudo TARGETS="tools/installers:shim"
-
-# Run the tests that require root.
-install_runsc_for_test root
-run_as_root //test/root:root_test --runtime=${RUNTIME}
diff --git a/scripts/runtime_tests.sh b/scripts/runtime_tests.sh
deleted file mode 100755
index 85e95d45d..000000000
--- a/scripts/runtime_tests.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-# Check that a runtime is provided.
-if [ ! -v RUNTIME_TEST_NAME ]; then
-  echo "Must set $RUNTIME_TEST_NAME" >&2
-  exit 1
-fi
-
-# Download language runtime image.
-make -C images/ "load-runtimes_${RUNTIME_TEST_NAME}"
-
-install_runsc_for_test runtimes
-test_runsc "//test/runtimes:${RUNTIME_TEST_NAME}"
diff --git a/scripts/simple_tests.sh b/scripts/simple_tests.sh
deleted file mode 100755
index 585216aae..000000000
--- a/scripts/simple_tests.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-# Run all simple tests (locally).
-test //pkg/... //runsc/... //tools/...
diff --git a/scripts/swgso_tests.sh b/scripts/swgso_tests.sh
deleted file mode 100755
index c67f2fe5c..000000000
--- a/scripts/swgso_tests.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-make load-all-images
-
-# Install the runtime and perform basic tests.
-install_runsc_for_test swgso --software-gso=true --gso=false
-test_runsc //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/syscall_kvm_tests.sh b/scripts/syscall_kvm_tests.sh
deleted file mode 100755
index 0e5d86727..000000000
--- a/scripts/syscall_kvm_tests.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-# Run all ptrace-variants of the system call tests.
-test --test_tag_filters=runsc_kvm //test/syscalls/...
diff --git a/scripts/syscall_tests.sh b/scripts/syscall_tests.sh
deleted file mode 100755
index a131b2d50..000000000
--- a/scripts/syscall_tests.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-# Run all ptrace-variants of the system call tests.
-test --test_tag_filters=runsc_ptrace //test/syscalls/...
diff --git a/shim/BUILD b/shim/BUILD
index e581618b2..8d29c459b 100644
--- a/shim/BUILD
+++ b/shim/BUILD
@@ -10,6 +10,6 @@ pkg_tar(
     mode = "0644",
     package_dir = "/etc/containerd",
     visibility = [
-        "//runsc:__pkg__",
+        "//visibility:public",
     ],
 )
diff --git a/shim/v1/BUILD b/shim/v1/BUILD
index 4c9e2c2c6..3614a67d1 100644
--- a/shim/v1/BUILD
+++ b/shim/v1/BUILD
@@ -4,27 +4,10 @@ package(licenses = ["notice"])
 
 go_binary(
     name = "gvisor-containerd-shim",
-    srcs = [
-        "api.go",
-        "config.go",
-        "main.go",
-    ],
+    srcs = ["main.go"],
     static = True,
     visibility = [
         "//visibility:public",
     ],
-    deps = [
-        "//pkg/shim/runsc",
-        "//pkg/shim/v1/shim",
-        "@com_github_burntsushi_toml//:go_default_library",
-        "@com_github_containerd_containerd//events:go_default_library",
-        "@com_github_containerd_containerd//namespaces:go_default_library",
-        "@com_github_containerd_containerd//runtime/v1/shim/v1:go_default_library",
-        "@com_github_containerd_containerd//sys:go_default_library",
-        "@com_github_containerd_containerd//sys/reaper:go_default_library",
-        "@com_github_containerd_ttrpc//:go_default_library",
-        "@com_github_containerd_typeurl//:go_default_library",
-        "@com_github_gogo_protobuf//types:go_default_library",
-        "@org_golang_x_sys//unix:go_default_library",
-    ],
+    deps = ["//shim/v1/cli"],
 )
diff --git a/shim/v1/cli/BUILD b/shim/v1/cli/BUILD
new file mode 100644
index 000000000..0bbdc4add
--- /dev/null
+++ b/shim/v1/cli/BUILD
@@ -0,0 +1,30 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "cli",
+    srcs = [
+        "api.go",
+        "cli.go",
+        "config.go",
+    ],
+    visibility = [
+        "//:__pkg__",
+        "//shim/v1:__pkg__",
+    ],
+    deps = [
+        "//pkg/shim/runsc",
+        "//pkg/shim/v1/shim",
+        "@com_github_burntsushi_toml//:go_default_library",
+        "@com_github_containerd_containerd//events:go_default_library",
+        "@com_github_containerd_containerd//namespaces:go_default_library",
+        "@com_github_containerd_containerd//runtime/v1/shim/v1:go_default_library",
+        "@com_github_containerd_containerd//sys:go_default_library",
+        "@com_github_containerd_containerd//sys/reaper:go_default_library",
+        "@com_github_containerd_ttrpc//:go_default_library",
+        "@com_github_containerd_typeurl//:go_default_library",
+        "@com_github_gogo_protobuf//types:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/shim/v1/api.go b/shim/v1/cli/api.go
index 2444d23f1..050793094 100644
--- a/shim/v1/api.go
+++ b/shim/v1/cli/api.go
@@ -13,7 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package main
+package cli
 
 import (
 	shim "github.com/containerd/containerd/runtime/v1/shim/v1"
diff --git a/shim/v1/cli/cli.go b/shim/v1/cli/cli.go
new file mode 100644
index 000000000..1a502eabd
--- /dev/null
+++ b/shim/v1/cli/cli.go
@@ -0,0 +1,267 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cli defines the command line interface for the V1 shim.
+package cli
+
+import (
+	"bytes"
+	"context"
+	"flag"
+	"fmt"
+	"log"
+	"net"
+	"os"
+	"os/exec"
+	"os/signal"
+	"path/filepath"
+	"strings"
+	"sync"
+	"syscall"
+
+	"github.com/containerd/containerd/events"
+	"github.com/containerd/containerd/namespaces"
+	"github.com/containerd/containerd/sys"
+	"github.com/containerd/containerd/sys/reaper"
+	"github.com/containerd/ttrpc"
+	"github.com/containerd/typeurl"
+	"github.com/gogo/protobuf/types"
+	"golang.org/x/sys/unix"
+
+	"gvisor.dev/gvisor/pkg/shim/runsc"
+	"gvisor.dev/gvisor/pkg/shim/v1/shim"
+)
+
+var (
+	debugFlag            bool
+	namespaceFlag        string
+	socketFlag           string
+	addressFlag          string
+	workdirFlag          string
+	runtimeRootFlag      string
+	containerdBinaryFlag string
+	shimConfigFlag       string
+)
+
+// Containerd defaults to runc, unless another runtime is explicitly specified.
+// We keep the same default to make the default behavior consistent.
+const defaultRoot = "/run/containerd/runc"
+
+func init() {
+	flag.BoolVar(&debugFlag, "debug", false, "enable debug output in logs")
+	flag.StringVar(&namespaceFlag, "namespace", "", "namespace that owns the shim")
+	flag.StringVar(&socketFlag, "socket", "", "abstract socket path to serve")
+	flag.StringVar(&addressFlag, "address", "", "grpc address back to main containerd")
+	flag.StringVar(&workdirFlag, "workdir", "", "path used to storge large temporary data")
+	flag.StringVar(&runtimeRootFlag, "runtime-root", defaultRoot, "root directory for the runtime")
+
+	// Currently, the `containerd publish` utility is embedded in the
+	// daemon binary.  The daemon invokes `containerd-shim
+	// -containerd-binary ...` with its own os.Executable() path.
+	flag.StringVar(&containerdBinaryFlag, "containerd-binary", "containerd", "path to containerd binary (used for `containerd publish`)")
+	flag.StringVar(&shimConfigFlag, "config", "/etc/containerd/runsc.toml", "path to the shim configuration file")
+}
+
+// Main is the main entrypoint.
+func Main() {
+	flag.Parse()
+
+	// This is a hack. Exec current process to run standard containerd-shim
+	// if runtime root is not `runsc`. We don't need this for shim v2 api.
+	if filepath.Base(runtimeRootFlag) != "runsc" {
+		if err := executeRuncShim(); err != nil {
+			fmt.Fprintf(os.Stderr, "gvisor-containerd-shim: %s\n", err)
+			os.Exit(1)
+		}
+	}
+
+	// Run regular shim if needed.
+	if err := executeShim(); err != nil {
+		fmt.Fprintf(os.Stderr, "gvisor-containerd-shim: %s\n", err)
+		os.Exit(1)
+	}
+}
+
+// executeRuncShim execs current process to a containerd-shim process and
+// retains all flags and envs.
+func executeRuncShim() error {
+	c, err := loadConfig(shimConfigFlag)
+	if err != nil && !os.IsNotExist(err) {
+		return fmt.Errorf("failed to load shim config: %w", err)
+	}
+	shimPath := c.RuncShim
+	if shimPath == "" {
+		shimPath, err = exec.LookPath("containerd-shim")
+		if err != nil {
+			return fmt.Errorf("lookup containerd-shim failed: %w", err)
+		}
+	}
+
+	args := append([]string{shimPath}, os.Args[1:]...)
+	if err := syscall.Exec(shimPath, args, os.Environ()); err != nil {
+		return fmt.Errorf("exec containerd-shim @ %q failed: %w", shimPath, err)
+	}
+	return nil
+}
+
+func executeShim() error {
+	// start handling signals as soon as possible so that things are
+	// properly reaped or if runtime exits before we hit the handler.
+	signals, err := setupSignals()
+	if err != nil {
+		return err
+	}
+	path, err := os.Getwd()
+	if err != nil {
+		return err
+	}
+	server, err := ttrpc.NewServer(ttrpc.WithServerHandshaker(ttrpc.UnixSocketRequireSameUser()))
+	if err != nil {
+		return fmt.Errorf("failed creating server: %w", err)
+	}
+	c, err := loadConfig(shimConfigFlag)
+	if err != nil && !os.IsNotExist(err) {
+		return fmt.Errorf("failed to load shim config: %w", err)
+	}
+	sv, err := shim.NewService(
+		shim.Config{
+			Path:        path,
+			Namespace:   namespaceFlag,
+			WorkDir:     workdirFlag,
+			RuntimeRoot: runtimeRootFlag,
+			RunscConfig: c.RunscConfig,
+		},
+		&remoteEventsPublisher{address: addressFlag},
+	)
+	if err != nil {
+		return err
+	}
+	registerShimService(server, sv)
+	if err := serve(server, socketFlag); err != nil {
+		return err
+	}
+	return handleSignals(signals, server, sv)
+}
+
+// serve serves the ttrpc API over a unix socket at the provided path this
+// function does not block.
+func serve(server *ttrpc.Server, path string) error {
+	var (
+		l   net.Listener
+		err error
+	)
+	if path == "" {
+		l, err = net.FileListener(os.NewFile(3, "socket"))
+		path = "[inherited from parent]"
+	} else {
+		if len(path) > 106 {
+			return fmt.Errorf("%q: unix socket path too long (> 106)", path)
+		}
+		l, err = net.Listen("unix", "\x00"+path)
+	}
+	if err != nil {
+		return err
+	}
+	go func() {
+		defer l.Close()
+		err := server.Serve(context.Background(), l)
+		if err != nil && !strings.Contains(err.Error(), "use of closed network connection") {
+			log.Fatalf("ttrpc server failure: %v", err)
+		}
+	}()
+	return nil
+}
+
+// setupSignals creates a new signal handler for all signals and sets the shim
+// as a sub-reaper so that the container processes are reparented.
+func setupSignals() (chan os.Signal, error) {
+	signals := make(chan os.Signal, 32)
+	signal.Notify(signals, unix.SIGTERM, unix.SIGINT, unix.SIGCHLD, unix.SIGPIPE)
+	// make sure runc is setup to use the monitor for waiting on processes.
+	// TODO(random-liu): Move shim/reaper.go to a separate package.
+	runsc.Monitor = reaper.Default
+	// Set the shim as the subreaper for all orphaned processes created by
+	// the container.
+	if err := unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0); err != nil {
+		return nil, err
+	}
+	return signals, nil
+}
+
+func handleSignals(signals chan os.Signal, server *ttrpc.Server, sv *shim.Service) error {
+	var (
+		termOnce sync.Once
+		done     = make(chan struct{})
+	)
+
+	for {
+		select {
+		case <-done:
+			return nil
+		case s := <-signals:
+			switch s {
+			case unix.SIGCHLD:
+				if _, err := sys.Reap(false); err != nil {
+					log.Printf("reap error: %v", err)
+				}
+			case unix.SIGTERM, unix.SIGINT:
+				go termOnce.Do(func() {
+					ctx := context.TODO()
+					if err := server.Shutdown(ctx); err != nil {
+						log.Printf("failed to shutdown server: %v", err)
+					}
+					// Ensure our child is dead if any.
+					sv.Kill(ctx, &KillRequest{
+						Signal: uint32(syscall.SIGKILL),
+						All:    true,
+					})
+					sv.Delete(context.Background(), &types.Empty{})
+					close(done)
+				})
+			case unix.SIGPIPE:
+			}
+		}
+	}
+}
+
+type remoteEventsPublisher struct {
+	address string
+}
+
+func (l *remoteEventsPublisher) Publish(ctx context.Context, topic string, event events.Event) error {
+	ns, _ := namespaces.Namespace(ctx)
+	encoded, err := typeurl.MarshalAny(event)
+	if err != nil {
+		return err
+	}
+	data, err := encoded.Marshal()
+	if err != nil {
+		return err
+	}
+	cmd := exec.CommandContext(ctx, containerdBinaryFlag, "--address", l.address, "publish", "--topic", topic, "--namespace", ns)
+	cmd.Stdin = bytes.NewReader(data)
+	c, err := reaper.Default.Start(cmd)
+	if err != nil {
+		return err
+	}
+	status, err := reaper.Default.Wait(cmd, c)
+	if err != nil {
+		return fmt.Errorf("failed to publish event: %w", err)
+	}
+	if status != 0 {
+		return fmt.Errorf("failed to publish event: status %d", status)
+	}
+	return nil
+}
diff --git a/shim/v1/config.go b/shim/v1/cli/config.go
index a72cc7754..1be9597ed 100644
--- a/shim/v1/config.go
+++ b/shim/v1/cli/config.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package main
+package cli
 
 import "github.com/BurntSushi/toml"
 
diff --git a/shim/v1/main.go b/shim/v1/main.go
index 3159923af..11ff4add1 100644
--- a/shim/v1/main.go
+++ b/shim/v1/main.go
@@ -1,5 +1,4 @@
-// Copyright 2018 The containerd Authors.
-// Copyright 2019 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,253 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// Binary gvisor-containerd-shim is the v1 containerd shim.
 package main
 
 import (
-	"bytes"
-	"context"
-	"flag"
-	"fmt"
-	"log"
-	"net"
-	"os"
-	"os/exec"
-	"os/signal"
-	"path/filepath"
-	"strings"
-	"sync"
-	"syscall"
-
-	"github.com/containerd/containerd/events"
-	"github.com/containerd/containerd/namespaces"
-	"github.com/containerd/containerd/sys"
-	"github.com/containerd/containerd/sys/reaper"
-	"github.com/containerd/ttrpc"
-	"github.com/containerd/typeurl"
-	"github.com/gogo/protobuf/types"
-	"golang.org/x/sys/unix"
-
-	"gvisor.dev/gvisor/pkg/shim/runsc"
-	"gvisor.dev/gvisor/pkg/shim/v1/shim"
-)
-
-var (
-	debugFlag            bool
-	namespaceFlag        string
-	socketFlag           string
-	addressFlag          string
-	workdirFlag          string
-	runtimeRootFlag      string
-	containerdBinaryFlag string
-	shimConfigFlag       string
+	"gvisor.dev/gvisor/shim/v1/cli"
 )
 
-// Containerd defaults to runc, unless another runtime is explicitly specified.
-// We keep the same default to make the default behavior consistent.
-const defaultRoot = "/run/containerd/runc"
-
-func init() {
-	flag.BoolVar(&debugFlag, "debug", false, "enable debug output in logs")
-	flag.StringVar(&namespaceFlag, "namespace", "", "namespace that owns the shim")
-	flag.StringVar(&socketFlag, "socket", "", "abstract socket path to serve")
-	flag.StringVar(&addressFlag, "address", "", "grpc address back to main containerd")
-	flag.StringVar(&workdirFlag, "workdir", "", "path used to storge large temporary data")
-	flag.StringVar(&runtimeRootFlag, "runtime-root", defaultRoot, "root directory for the runtime")
-
-	// Currently, the `containerd publish` utility is embedded in the
-	// daemon binary.  The daemon invokes `containerd-shim
-	// -containerd-binary ...` with its own os.Executable() path.
-	flag.StringVar(&containerdBinaryFlag, "containerd-binary", "containerd", "path to containerd binary (used for `containerd publish`)")
-	flag.StringVar(&shimConfigFlag, "config", "/etc/containerd/runsc.toml", "path to the shim configuration file")
-}
-
 func main() {
-	flag.Parse()
-
-	// This is a hack. Exec current process to run standard containerd-shim
-	// if runtime root is not `runsc`. We don't need this for shim v2 api.
-	if filepath.Base(runtimeRootFlag) != "runsc" {
-		if err := executeRuncShim(); err != nil {
-			fmt.Fprintf(os.Stderr, "gvisor-containerd-shim: %s\n", err)
-			os.Exit(1)
-		}
-	}
-
-	// Run regular shim if needed.
-	if err := executeShim(); err != nil {
-		fmt.Fprintf(os.Stderr, "gvisor-containerd-shim: %s\n", err)
-		os.Exit(1)
-	}
-}
-
-// executeRuncShim execs current process to a containerd-shim process and
-// retains all flags and envs.
-func executeRuncShim() error {
-	c, err := loadConfig(shimConfigFlag)
-	if err != nil && !os.IsNotExist(err) {
-		return fmt.Errorf("failed to load shim config: %w", err)
-	}
-	shimPath := c.RuncShim
-	if shimPath == "" {
-		shimPath, err = exec.LookPath("containerd-shim")
-		if err != nil {
-			return fmt.Errorf("lookup containerd-shim failed: %w", err)
-		}
-	}
-
-	args := append([]string{shimPath}, os.Args[1:]...)
-	if err := syscall.Exec(shimPath, args, os.Environ()); err != nil {
-		return fmt.Errorf("exec containerd-shim @ %q failed: %w", shimPath, err)
-	}
-	return nil
-}
-
-func executeShim() error {
-	// start handling signals as soon as possible so that things are
-	// properly reaped or if runtime exits before we hit the handler.
-	signals, err := setupSignals()
-	if err != nil {
-		return err
-	}
-	path, err := os.Getwd()
-	if err != nil {
-		return err
-	}
-	server, err := ttrpc.NewServer(ttrpc.WithServerHandshaker(ttrpc.UnixSocketRequireSameUser()))
-	if err != nil {
-		return fmt.Errorf("failed creating server: %w", err)
-	}
-	c, err := loadConfig(shimConfigFlag)
-	if err != nil && !os.IsNotExist(err) {
-		return fmt.Errorf("failed to load shim config: %w", err)
-	}
-	sv, err := shim.NewService(
-		shim.Config{
-			Path:        path,
-			Namespace:   namespaceFlag,
-			WorkDir:     workdirFlag,
-			RuntimeRoot: runtimeRootFlag,
-			RunscConfig: c.RunscConfig,
-		},
-		&remoteEventsPublisher{address: addressFlag},
-	)
-	if err != nil {
-		return err
-	}
-	registerShimService(server, sv)
-	if err := serve(server, socketFlag); err != nil {
-		return err
-	}
-	return handleSignals(signals, server, sv)
-}
-
-// serve serves the ttrpc API over a unix socket at the provided path this
-// function does not block.
-func serve(server *ttrpc.Server, path string) error {
-	var (
-		l   net.Listener
-		err error
-	)
-	if path == "" {
-		l, err = net.FileListener(os.NewFile(3, "socket"))
-		path = "[inherited from parent]"
-	} else {
-		if len(path) > 106 {
-			return fmt.Errorf("%q: unix socket path too long (> 106)", path)
-		}
-		l, err = net.Listen("unix", "\x00"+path)
-	}
-	if err != nil {
-		return err
-	}
-	go func() {
-		defer l.Close()
-		err := server.Serve(context.Background(), l)
-		if err != nil && !strings.Contains(err.Error(), "use of closed network connection") {
-			log.Fatalf("ttrpc server failure: %v", err)
-		}
-	}()
-	return nil
-}
-
-// setupSignals creates a new signal handler for all signals and sets the shim
-// as a sub-reaper so that the container processes are reparented.
-func setupSignals() (chan os.Signal, error) {
-	signals := make(chan os.Signal, 32)
-	signal.Notify(signals, unix.SIGTERM, unix.SIGINT, unix.SIGCHLD, unix.SIGPIPE)
-	// make sure runc is setup to use the monitor for waiting on processes.
-	// TODO(random-liu): Move shim/reaper.go to a separate package.
-	runsc.Monitor = reaper.Default
-	// Set the shim as the subreaper for all orphaned processes created by
-	// the container.
-	if err := unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0); err != nil {
-		return nil, err
-	}
-	return signals, nil
-}
-
-func handleSignals(signals chan os.Signal, server *ttrpc.Server, sv *shim.Service) error {
-	var (
-		termOnce sync.Once
-		done     = make(chan struct{})
-	)
-
-	for {
-		select {
-		case <-done:
-			return nil
-		case s := <-signals:
-			switch s {
-			case unix.SIGCHLD:
-				if _, err := sys.Reap(false); err != nil {
-					log.Printf("reap error: %v", err)
-				}
-			case unix.SIGTERM, unix.SIGINT:
-				go termOnce.Do(func() {
-					ctx := context.TODO()
-					if err := server.Shutdown(ctx); err != nil {
-						log.Printf("failed to shutdown server: %v", err)
-					}
-					// Ensure our child is dead if any.
-					sv.Kill(ctx, &KillRequest{
-						Signal: uint32(syscall.SIGKILL),
-						All:    true,
-					})
-					sv.Delete(context.Background(), &types.Empty{})
-					close(done)
-				})
-			case unix.SIGPIPE:
-			}
-		}
-	}
-}
-
-type remoteEventsPublisher struct {
-	address string
-}
-
-func (l *remoteEventsPublisher) Publish(ctx context.Context, topic string, event events.Event) error {
-	ns, _ := namespaces.Namespace(ctx)
-	encoded, err := typeurl.MarshalAny(event)
-	if err != nil {
-		return err
-	}
-	data, err := encoded.Marshal()
-	if err != nil {
-		return err
-	}
-	cmd := exec.CommandContext(ctx, containerdBinaryFlag, "--address", l.address, "publish", "--topic", topic, "--namespace", ns)
-	cmd.Stdin = bytes.NewReader(data)
-	c, err := reaper.Default.Start(cmd)
-	if err != nil {
-		return err
-	}
-	status, err := reaper.Default.Wait(cmd, c)
-	if err != nil {
-		return fmt.Errorf("failed to publish event: %w", err)
-	}
-	if status != 0 {
-		return fmt.Errorf("failed to publish event: status %d", status)
-	}
-	return nil
+	cli.Main()
 }
diff --git a/shim/v2/BUILD b/shim/v2/BUILD
index 8de9ac0ba..b4a107d27 100644
--- a/shim/v2/BUILD
+++ b/shim/v2/BUILD
@@ -4,15 +4,10 @@ package(licenses = ["notice"])
 
 go_binary(
     name = "containerd-shim-runsc-v1",
-    srcs = [
-        "main.go",
-    ],
+    srcs = ["main.go"],
     static = True,
     visibility = [
         "//visibility:public",
     ],
-    deps = [
-        "//pkg/shim/v2",
-        "@com_github_containerd_containerd//runtime/v2/shim:go_default_library",
-    ],
+    deps = ["//shim/v2/cli"],
 )
diff --git a/shim/v2/cli/BUILD b/shim/v2/cli/BUILD
new file mode 100644
index 000000000..6681e0772
--- /dev/null
+++ b/shim/v2/cli/BUILD
@@ -0,0 +1,16 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "cli",
+    srcs = ["cli.go"],
+    visibility = [
+        "//:__pkg__",
+        "//shim/v2:__pkg__",
+    ],
+    deps = [
+        "//pkg/shim/v2",
+        "@com_github_containerd_containerd//runtime/v2/shim:go_default_library",
+    ],
+)
diff --git a/shim/v2/cli/cli.go b/shim/v2/cli/cli.go
new file mode 100644
index 000000000..3d6644feb
--- /dev/null
+++ b/shim/v2/cli/cli.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cli defines the command line interface for the V2 shim.
+package cli
+
+import (
+	"github.com/containerd/containerd/runtime/v2/shim"
+
+	"gvisor.dev/gvisor/pkg/shim/v2"
+)
+
+// Main is the main entrypoint.
+func Main() {
+	shim.Run("io.containerd.runsc.v1", v2.New)
+}
diff --git a/shim/v2/main.go b/shim/v2/main.go
index 753871eea..3680cdf9c 100644
--- a/shim/v2/main.go
+++ b/shim/v2/main.go
@@ -1,5 +1,4 @@
-// Copyright 2018 The containerd Authors.
-// Copyright 2019 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// Binary containerd-shim-runsc-v1 is the v2 containerd shim (implementing the formal v1 API).
 package main
 
 import (
-	"github.com/containerd/containerd/runtime/v2/shim"
-
-	"gvisor.dev/gvisor/pkg/shim/v2"
+	"gvisor.dev/gvisor/shim/v2/cli"
 )
 
 func main() {
-	shim.Run("io.containerd.runsc.v1", v2.New)
+	cli.Main()
 }
diff --git a/test/README.md b/test/README.md
index 02bbf42ff..15b0f4c33 100644
--- a/test/README.md
+++ b/test/README.md
@@ -24,11 +24,11 @@ also used to run these tests in `kokoro`.
 
 To run image and integration tests, run:
 
-`./scripts/docker_tests.sh`
+`make docker-tests`
 
 To run root tests, run:
 
-`./scripts/root_tests.sh`
+`make root-tests`
 
 There are a few other interesting variations for image and integration tests:
 
diff --git a/test/benchmarks/base/size_test.go b/test/benchmarks/base/size_test.go
index 3c1364faf..7d3877459 100644
--- a/test/benchmarks/base/size_test.go
+++ b/test/benchmarks/base/size_test.go
@@ -105,6 +105,7 @@ func BenchmarkSizeNginx(b *testing.B) {
 			machine: machine,
 			port:    port,
 			runOpts: runOpts,
+			cmd:     []string{"nginx", "-c", "/etc/nginx/nginx_gofer.conf"},
 		})
 	defer cleanUpContainers(ctx, servers)
 
diff --git a/test/benchmarks/base/startup_test.go b/test/benchmarks/base/startup_test.go
index 4628a0a41..c36a544db 100644
--- a/test/benchmarks/base/startup_test.go
+++ b/test/benchmarks/base/startup_test.go
@@ -64,6 +64,7 @@ func BenchmarkStartupNginx(b *testing.B) {
 			machine: machine,
 			runOpts: runOpts,
 			port:    80,
+			cmd:     []string{"nginx", "-c", "/etc/nginx/nginx_gofer.conf"},
 		})
 }
 
@@ -123,8 +124,6 @@ func redisInstance(ctx context.Context, b *testing.B, machine harness.Machine) (
 // runServerWorkload runs a server workload defined by 'runOpts' and 'cmd'.
 // 'clientMachine' is used to connect to the server on 'serverMachine'.
 func runServerWorkload(ctx context.Context, b *testing.B, args serverArgs) {
-	b.Helper()
-
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		if err := func() error {
diff --git a/test/benchmarks/base/sysbench_test.go b/test/benchmarks/base/sysbench_test.go
index 6fb813640..39ced3dab 100644
--- a/test/benchmarks/base/sysbench_test.go
+++ b/test/benchmarks/base/sysbench_test.go
@@ -71,8 +71,15 @@ func BenchmarkSysbench(b *testing.B) {
 	defer machine.CleanUp()
 
 	for _, tc := range testCases {
-		b.Run(tc.name, func(b *testing.B) {
-
+		param := tools.Parameter{
+			Name:  "testname",
+			Value: tc.name,
+		}
+		name, err := tools.ParametersToName(param)
+		if err != nil {
+			b.Fatalf("Failed to parse params: %v", err)
+		}
+		b.Run(name, func(b *testing.B) {
 			ctx := context.Background()
 			sysbench := machine.GetContainer(ctx, b)
 			defer sysbench.CleanUp(ctx)
diff --git a/test/benchmarks/database/redis_test.go b/test/benchmarks/database/redis_test.go
index 394fce820..02e67154e 100644
--- a/test/benchmarks/database/redis_test.go
+++ b/test/benchmarks/database/redis_test.go
@@ -66,7 +66,15 @@ func BenchmarkRedis(b *testing.B) {
 	ctx := context.Background()
 
 	for _, operation := range operations {
-		b.Run(operation, func(b *testing.B) {
+		param := tools.Parameter{
+			Name:  "operation",
+			Value: operation,
+		}
+		name, err := tools.ParametersToName(param)
+		if err != nil {
+			b.Fatalf("Failed to parse paramaters: %v", err)
+		}
+		b.Run(name, func(b *testing.B) {
 			server := serverMachine.GetContainer(ctx, b)
 			defer server.CleanUp(ctx)
 
@@ -84,12 +92,12 @@ func BenchmarkRedis(b *testing.B) {
 
 			ip, err := serverMachine.IPAddress()
 			if err != nil {
-				b.Fatal("failed to get IP from server: %v", err)
+				b.Fatalf("failed to get IP from server: %v", err)
 			}
 
 			serverPort, err := server.FindPort(ctx, port)
 			if err != nil {
-				b.Fatal("failed to get IP from server: %v", err)
+				b.Fatalf("failed to get IP from server: %v", err)
 			}
 
 			if err = harness.WaitUntilServing(ctx, clientMachine, ip, serverPort); err != nil {
diff --git a/test/benchmarks/fs/bazel_test.go b/test/benchmarks/fs/bazel_test.go
index f4236ba37..56103639d 100644
--- a/test/benchmarks/fs/bazel_test.go
+++ b/test/benchmarks/fs/bazel_test.go
@@ -21,6 +21,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/test/dockerutil"
 	"gvisor.dev/gvisor/test/benchmarks/harness"
+	"gvisor.dev/gvisor/test/benchmarks/tools"
 )
 
 // Note: CleanCache versions of this test require running with root permissions.
@@ -46,23 +47,42 @@ func runBuildBenchmark(b *testing.B, image, workdir, target string) {
 	// Dimensions here are clean/dirty cache (do or don't drop caches)
 	// and if the mount on which we are compiling is a tmpfs/bind mount.
 	benchmarks := []struct {
-		name       string
 		clearCache bool // clearCache drops caches before running.
 		tmpfs      bool // tmpfs will run compilation on a tmpfs.
 	}{
-		{name: "CleanCache", clearCache: true, tmpfs: false},
-		{name: "DirtyCache", clearCache: false, tmpfs: false},
-		{name: "CleanCacheTmpfs", clearCache: true, tmpfs: true},
-		{name: "DirtyCacheTmpfs", clearCache: false, tmpfs: true},
+		{clearCache: true, tmpfs: false},
+		{clearCache: false, tmpfs: false},
+		{clearCache: true, tmpfs: true},
+		{clearCache: false, tmpfs: true},
 	}
 	for _, bm := range benchmarks {
-		b.Run(bm.name, func(b *testing.B) {
+		pageCache := tools.Parameter{
+			Name:  "page_cache",
+			Value: "clean",
+		}
+		if bm.clearCache {
+			pageCache.Value = "dirty"
+		}
+
+		filesystem := tools.Parameter{
+			Name:  "filesystem",
+			Value: "bind",
+		}
+		if bm.tmpfs {
+			filesystem.Value = "tmpfs"
+		}
+		name, err := tools.ParametersToName(pageCache, filesystem)
+		if err != nil {
+			b.Fatalf("Failed to parse parameters: %v", err)
+		}
+
+		b.Run(name, func(b *testing.B) {
 			// Grab a container.
 			ctx := context.Background()
 			container := machine.GetContainer(ctx, b)
 			defer container.CleanUp(ctx)
 
-			// Start a container and sleep by an order of b.N.
+			// Start a container and sleep.
 			if err := container.Spawn(ctx, dockerutil.RunOpts{
 				Image: image,
 			}, "sleep", fmt.Sprintf("%d", 1000000)); err != nil {
@@ -70,12 +90,13 @@ func runBuildBenchmark(b *testing.B, image, workdir, target string) {
 			}
 
 			// If we are running on a tmpfs, copy to /tmp which is a tmpfs.
+			prefix := ""
 			if bm.tmpfs {
 				if out, err := container.Exec(ctx, dockerutil.ExecOpts{},
 					"cp", "-r", workdir, "/tmp/."); err != nil {
-					b.Fatal("failed to copy directory: %v %s", err, out)
+					b.Fatalf("failed to copy directory: %v (%s)", err, out)
 				}
-				workdir = "/tmp" + workdir
+				prefix = "/tmp"
 			}
 
 			// Restart profiles after the copy.
@@ -94,7 +115,7 @@ func runBuildBenchmark(b *testing.B, image, workdir, target string) {
 				b.StartTimer()
 
 				got, err := container.Exec(ctx, dockerutil.ExecOpts{
-					WorkDir: workdir,
+					WorkDir: prefix + workdir,
 				}, "bazel", "build", "-c", "opt", target)
 				if err != nil {
 					b.Fatalf("build failed with: %v", err)
@@ -107,7 +128,7 @@ func runBuildBenchmark(b *testing.B, image, workdir, target string) {
 				}
 				// Clean bazel in case we use b.N.
 				_, err = container.Exec(ctx, dockerutil.ExecOpts{
-					WorkDir: workdir,
+					WorkDir: prefix + workdir,
 				}, "bazel", "clean")
 				if err != nil {
 					b.Fatalf("build failed with: %v", err)
diff --git a/test/benchmarks/fs/fio_test.go b/test/benchmarks/fs/fio_test.go
index 65874ed8b..5ca191404 100644
--- a/test/benchmarks/fs/fio_test.go
+++ b/test/benchmarks/fs/fio_test.go
@@ -67,8 +67,19 @@ func BenchmarkFio(b *testing.B) {
 
 	for _, fsType := range []mount.Type{mount.TypeBind, mount.TypeTmpfs} {
 		for _, tc := range testCases {
-			testName := strings.Title(tc.Test) + strings.Title(string(fsType))
-			b.Run(testName, func(b *testing.B) {
+			operation := tools.Parameter{
+				Name:  "operation",
+				Value: tc.Test,
+			}
+			filesystem := tools.Parameter{
+				Name:  "filesystem",
+				Value: string(fsType),
+			}
+			name, err := tools.ParametersToName(operation, filesystem)
+			if err != nil {
+				b.Fatalf("Failed to parser paramters: %v", err)
+			}
+			b.Run(name, func(b *testing.B) {
 				ctx := context.Background()
 				container := machine.GetContainer(ctx, b)
 				defer container.CleanUp(ctx)
diff --git a/test/benchmarks/network/BUILD b/test/benchmarks/network/BUILD
index bd3f6245c..472b5c387 100644
--- a/test/benchmarks/network/BUILD
+++ b/test/benchmarks/network/BUILD
@@ -5,8 +5,15 @@ package(licenses = ["notice"])
 go_library(
     name = "network",
     testonly = 1,
-    srcs = ["network.go"],
-    deps = ["//test/benchmarks/harness"],
+    srcs = [
+        "network.go",
+        "static_server.go",
+    ],
+    deps = [
+        "//pkg/test/dockerutil",
+        "//test/benchmarks/harness",
+        "//test/benchmarks/tools",
+    ],
 )
 
 go_test(
diff --git a/test/benchmarks/network/httpd_test.go b/test/benchmarks/network/httpd_test.go
index 336e04c91..8d7d5f750 100644
--- a/test/benchmarks/network/httpd_test.go
+++ b/test/benchmarks/network/httpd_test.go
@@ -14,120 +14,71 @@
 package network
 
 import (
-	"context"
-	"fmt"
+	"strconv"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/test/dockerutil"
-	"gvisor.dev/gvisor/test/benchmarks/harness"
 	"gvisor.dev/gvisor/test/benchmarks/tools"
 )
 
 // see Dockerfile '//images/benchmarks/httpd'.
-var docs = map[string]string{
+var httpdDocs = map[string]string{
 	"notfound": "notfound",
 	"1Kb":      "latin1k.txt",
 	"10Kb":     "latin10k.txt",
 	"100Kb":    "latin100k.txt",
-	"1000Kb":   "latin1000k.txt",
 	"1Mb":      "latin1024k.txt",
 	"10Mb":     "latin10240k.txt",
 }
 
-// BenchmarkHttpdConcurrency iterates the concurrency argument and tests
-// how well the runtime under test handles requests in parallel.
-func BenchmarkHttpdConcurrency(b *testing.B) {
-	// Grab a machine for the client and server.
-	clientMachine, err := h.GetMachine()
-	if err != nil {
-		b.Fatalf("failed to get client: %v", err)
-	}
-	defer clientMachine.CleanUp()
-
-	serverMachine, err := h.GetMachine()
-	if err != nil {
-		b.Fatalf("failed to get server: %v", err)
-	}
-	defer serverMachine.CleanUp()
-
-	// The test iterates over client concurrency, so set other parameters.
-	concurrency := []int{1, 25, 50, 100, 1000}
-
-	for _, c := range concurrency {
-		b.Run(fmt.Sprintf("%d", c), func(b *testing.B) {
-			hey := &tools.Hey{
-				Requests:    10000,
-				Concurrency: c,
-				Doc:         docs["10Kb"],
-			}
-			runHttpd(b, clientMachine, serverMachine, hey, false /* reverse */)
-		})
-	}
-}
-
-// BenchmarkHttpdDocSize iterates over different sized payloads, testing how
-// well the runtime handles sending different payload sizes.
-func BenchmarkHttpdDocSize(b *testing.B) {
+// BenchmarkHttpd iterates over different sized payloads and concurrency, testing
+// how well the runtime handles sending different payload sizes.
+func BenchmarkHttpd(b *testing.B) {
 	benchmarkHttpdDocSize(b, false /* reverse */)
 }
 
-// BenchmarkReverseHttpdDocSize iterates over different sized payloads, testing
+// BenchmarkReverseHttpd iterates over different sized payloads, testing
 // how well the runtime handles receiving different payload sizes.
-func BenchmarkReverseHttpdDocSize(b *testing.B) {
+func BenchmarkReverseHttpd(b *testing.B) {
 	benchmarkHttpdDocSize(b, true /* reverse */)
 }
 
+// benchmarkHttpdDocSize iterates through all doc sizes, running subbenchmarks
+// for each size.
 func benchmarkHttpdDocSize(b *testing.B, reverse bool) {
 	b.Helper()
-
-	clientMachine, err := h.GetMachine()
-	if err != nil {
-		b.Fatalf("failed to get machine: %v", err)
-	}
-	defer clientMachine.CleanUp()
-
-	serverMachine, err := h.GetMachine()
-	if err != nil {
-		b.Fatalf("failed to get machine: %v", err)
-	}
-	defer serverMachine.CleanUp()
-
-	for name, filename := range docs {
+	for size, filename := range httpdDocs {
 		concurrency := []int{1, 25, 50, 100, 1000}
 		for _, c := range concurrency {
-			b.Run(fmt.Sprintf("%s_%d", name, c), func(b *testing.B) {
+			fsize := tools.Parameter{
+				Name:  "filesize",
+				Value: size,
+			}
+			concurrency := tools.Parameter{
+				Name:  "concurrency",
+				Value: strconv.Itoa(c),
+			}
+			name, err := tools.ParametersToName(fsize, concurrency)
+			if err != nil {
+				b.Fatalf("Failed to parse parameters: %v", err)
+			}
+			b.Run(name, func(b *testing.B) {
 				hey := &tools.Hey{
-					Requests:    10000,
+					Requests:    c * b.N,
 					Concurrency: c,
 					Doc:         filename,
 				}
-				runHttpd(b, clientMachine, serverMachine, hey, reverse)
+				runHttpd(b, hey, reverse)
 			})
 		}
 	}
 }
 
-// runHttpd runs a single test run.
-func runHttpd(b *testing.B, clientMachine, serverMachine harness.Machine, hey *tools.Hey, reverse bool) {
-	b.Helper()
-
-	// Grab a container from the server.
-	ctx := context.Background()
-	var server *dockerutil.Container
-	if reverse {
-		server = serverMachine.GetNativeContainer(ctx, b)
-	} else {
-		server = serverMachine.GetContainer(ctx, b)
-	}
-
-	defer server.CleanUp(ctx)
-
-	// Copy the docs to /tmp and serve from there.
-	cmd := "mkdir -p /tmp/html; cp -r /local/* /tmp/html/.; apache2 -X"
+// runHttpd configures the static serving methods to run httpd.
+func runHttpd(b *testing.B, hey *tools.Hey, reverse bool) {
+	// httpd runs on port 80.
 	port := 80
-
-	// Start the server.
-	if err := server.Spawn(ctx, dockerutil.RunOpts{
+	httpdRunOpts := dockerutil.RunOpts{
 		Image: "benchmarks/httpd",
 		Ports: []int{port},
 		Env: []string{
@@ -138,44 +89,7 @@ func runHttpd(b *testing.B, clientMachine, serverMachine harness.Machine, hey *t
 			"APACHE_LOG_DIR=/tmp",
 			"APACHE_PID_FILE=/tmp/apache.pid",
 		},
-	}, "sh", "-c", cmd); err != nil {
-		b.Fatalf("failed to start server: %v", err)
-	}
-
-	ip, err := serverMachine.IPAddress()
-	if err != nil {
-		b.Fatalf("failed to find server ip: %v", err)
-	}
-
-	servingPort, err := server.FindPort(ctx, port)
-	if err != nil {
-		b.Fatalf("failed to find server port %d: %v", port, err)
-	}
-
-	// Check the server is serving.
-	harness.WaitUntilServing(ctx, clientMachine, ip, servingPort)
-
-	var client *dockerutil.Container
-	// Grab a client.
-	if reverse {
-		client = clientMachine.GetContainer(ctx, b)
-	} else {
-		client = clientMachine.GetNativeContainer(ctx, b)
-	}
-	defer client.CleanUp(ctx)
-
-	b.ResetTimer()
-	server.RestartProfiles()
-	for i := 0; i < b.N; i++ {
-		out, err := client.Run(ctx, dockerutil.RunOpts{
-			Image: "benchmarks/hey",
-		}, hey.MakeCmd(ip, servingPort)...)
-		if err != nil {
-			b.Fatalf("run failed with: %v", err)
-		}
-
-		b.StopTimer()
-		hey.Report(b, out)
-		b.StartTimer()
 	}
+	httpdCmd := []string{"sh", "-c", "mkdir -p /tmp/html; cp -r /local/* /tmp/html/.; apache2 -X"}
+	runStaticServer(b, httpdRunOpts, httpdCmd, port, hey, reverse)
 }
diff --git a/test/benchmarks/network/nginx_test.go b/test/benchmarks/network/nginx_test.go
index 2bf1a3624..08565d0b2 100644
--- a/test/benchmarks/network/nginx_test.go
+++ b/test/benchmarks/network/nginx_test.go
@@ -14,91 +14,90 @@
 package network
 
 import (
-	"context"
-	"fmt"
+	"strconv"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/test/dockerutil"
-	"gvisor.dev/gvisor/test/benchmarks/harness"
 	"gvisor.dev/gvisor/test/benchmarks/tools"
 )
 
-// BenchmarkNginxConcurrency iterates the concurrency argument and tests
-// how well the runtime under test handles requests in parallel.
-// TODO(gvisor.dev/issue/3536): Update with different doc sizes like Httpd.
-func BenchmarkNginxConcurrency(b *testing.B) {
-	// Grab a machine for the client and server.
-	clientMachine, err := h.GetMachine()
-	if err != nil {
-		b.Fatalf("failed to get client: %v", err)
-	}
-	defer clientMachine.CleanUp()
+// see Dockerfile '//images/benchmarks/nginx'.
+var nginxDocs = map[string]string{
+	"notfound": "notfound",
+	"1Kb":      "latin1k.txt",
+	"10Kb":     "latin10k.txt",
+	"100Kb":    "latin100k.txt",
+	"1Mb":      "latin1024k.txt",
+	"10Mb":     "latin10240k.txt",
+}
 
-	serverMachine, err := h.GetMachine()
-	if err != nil {
-		b.Fatalf("failed to get server: %v", err)
-	}
-	defer serverMachine.CleanUp()
+// BenchmarkNginxDocSize iterates over different sized payloads, testing how
+// well the runtime handles sending different payload sizes.
+func BenchmarkNginxDocSize(b *testing.B) {
+	benchmarkNginxDocSize(b, false /* reverse */, true /* tmpfs */)
+	benchmarkNginxDocSize(b, false /* reverse */, false /* tmpfs */)
+}
 
-	concurrency := []int{1, 5, 10, 25}
-	for _, c := range concurrency {
-		b.Run(fmt.Sprintf("%d", c), func(b *testing.B) {
-			hey := &tools.Hey{
-				Requests:    10000,
-				Concurrency: c,
-			}
-			runNginx(b, clientMachine, serverMachine, hey)
-		})
-	}
+// BenchmarkReverseNginxDocSize iterates over different sized payloads, testing
+// how well the runtime handles receiving different payload sizes.
+func BenchmarkReverseNginxDocSize(b *testing.B) {
+	benchmarkNginxDocSize(b, true /* reverse */, true /* tmpfs */)
 }
 
-// runHttpd runs a single test run.
-func runNginx(b *testing.B, clientMachine, serverMachine harness.Machine, hey *tools.Hey) {
-	b.Helper()
+// benchmarkNginxDocSize iterates through all doc sizes, running subbenchmarks
+// for each size.
+func benchmarkNginxDocSize(b *testing.B, reverse, tmpfs bool) {
+	for size, filename := range nginxDocs {
+		concurrency := []int{1, 25, 50, 100, 1000}
+		for _, c := range concurrency {
+			fsize := tools.Parameter{
+				Name:  "filesize",
+				Value: size,
+			}
 
-	// Grab a container from the server.
-	ctx := context.Background()
-	server := serverMachine.GetContainer(ctx, b)
-	defer server.CleanUp(ctx)
+			threads := tools.Parameter{
+				Name:  "concurrency",
+				Value: strconv.Itoa(c),
+			}
 
-	port := 80
-	// Start the server.
-	if err := server.Spawn(ctx,
-		dockerutil.RunOpts{
-			Image: "benchmarks/nginx",
-			Ports: []int{port},
-		}); err != nil {
-		b.Fatalf("server failed to start: %v", err)
-	}
+			fs := tools.Parameter{
+				Name:  "filesystem",
+				Value: "bind",
+			}
+			if tmpfs {
+				fs.Value = "tmpfs"
+			}
+			name, err := tools.ParametersToName(fsize, threads, fs)
+			if err != nil {
+				b.Fatalf("Failed to parse parameters: %v", err)
+			}
 
-	ip, err := serverMachine.IPAddress()
-	if err != nil {
-		b.Fatalf("failed to find server ip: %v", err)
+			b.Run(name, func(b *testing.B) {
+				hey := &tools.Hey{
+					Requests:    c * b.N,
+					Concurrency: c,
+					Doc:         filename,
+				}
+				runNginx(b, hey, reverse, tmpfs)
+			})
+		}
 	}
+}
 
-	servingPort, err := server.FindPort(ctx, port)
-	if err != nil {
-		b.Fatalf("failed to find server port %d: %v", port, err)
+// runNginx configures the static serving methods to run httpd.
+func runNginx(b *testing.B, hey *tools.Hey, reverse, tmpfs bool) {
+	// nginx runs on port 80.
+	port := 80
+	nginxRunOpts := dockerutil.RunOpts{
+		Image: "benchmarks/nginx",
+		Ports: []int{port},
 	}
 
-	// Check the server is serving.
-	harness.WaitUntilServing(ctx, clientMachine, ip, servingPort)
-
-	// Grab a client.
-	client := clientMachine.GetNativeContainer(ctx, b)
-	defer client.CleanUp(ctx)
-
-	b.ResetTimer()
-	server.RestartProfiles()
-	for i := 0; i < b.N; i++ {
-		out, err := client.Run(ctx, dockerutil.RunOpts{
-			Image: "benchmarks/hey",
-		}, hey.MakeCmd(ip, servingPort)...)
-		if err != nil {
-			b.Fatalf("run failed with: %v", err)
-		}
-		b.StopTimer()
-		hey.Report(b, out)
-		b.StartTimer()
+	nginxCmd := []string{"nginx", "-c", "/etc/nginx/nginx_gofer.conf"}
+	if tmpfs {
+		nginxCmd = []string{"sh", "-c", "mkdir -p /tmp/html && cp -a /local/* /tmp/html && nginx -c /etc/nginx/nginx.conf"}
 	}
+
+	// Command copies nginxDocs to tmpfs serving directory and runs nginx.
+	runStaticServer(b, nginxRunOpts, nginxCmd, port, hey, reverse)
 }
diff --git a/test/benchmarks/network/node_test.go b/test/benchmarks/network/node_test.go
index 52eb794c4..254538899 100644
--- a/test/benchmarks/network/node_test.go
+++ b/test/benchmarks/network/node_test.go
@@ -15,7 +15,7 @@ package network
 
 import (
 	"context"
-	"fmt"
+	"strconv"
 	"testing"
 	"time"
 
@@ -31,7 +31,15 @@ import (
 func BenchmarkNode(b *testing.B) {
 	concurrency := []int{1, 5, 10, 25}
 	for _, c := range concurrency {
-		b.Run(fmt.Sprintf("Concurrency%d", c), func(b *testing.B) {
+		param := tools.Parameter{
+			Name:  "concurrency",
+			Value: strconv.Itoa(c),
+		}
+		name, err := tools.ParametersToName(param)
+		if err != nil {
+			b.Fatalf("Failed to parse parameters: %v", err)
+		}
+		b.Run(name, func(b *testing.B) {
 			hey := &tools.Hey{
 				Requests:    b.N * c, // Requests b.N requests per thread.
 				Concurrency: c,
@@ -48,14 +56,14 @@ func runNode(b *testing.B, hey *tools.Hey) {
 	// The machine to hold Redis and the Node Server.
 	serverMachine, err := h.GetMachine()
 	if err != nil {
-		b.Fatal("failed to get machine with: %v", err)
+		b.Fatalf("failed to get machine with: %v", err)
 	}
 	defer serverMachine.CleanUp()
 
 	// The machine to run 'hey'.
 	clientMachine, err := h.GetMachine()
 	if err != nil {
-		b.Fatal("failed to get machine with: %v", err)
+		b.Fatalf("failed to get machine with: %v", err)
 	}
 	defer clientMachine.CleanUp()
 
diff --git a/test/benchmarks/network/ruby_test.go b/test/benchmarks/network/ruby_test.go
index 5e0b2b724..0174ff3f3 100644
--- a/test/benchmarks/network/ruby_test.go
+++ b/test/benchmarks/network/ruby_test.go
@@ -16,6 +16,7 @@ package network
 import (
 	"context"
 	"fmt"
+	"strconv"
 	"testing"
 	"time"
 
@@ -31,7 +32,15 @@ import (
 func BenchmarkRuby(b *testing.B) {
 	concurrency := []int{1, 5, 10, 25}
 	for _, c := range concurrency {
-		b.Run(fmt.Sprintf("Concurrency%d", c), func(b *testing.B) {
+		param := tools.Parameter{
+			Name:  "concurrency",
+			Value: strconv.Itoa(c),
+		}
+		name, err := tools.ParametersToName(param)
+		if err != nil {
+			b.Fatalf("Failed to parse parameters: %v", err)
+		}
+		b.Run(name, func(b *testing.B) {
 			hey := &tools.Hey{
 				Requests:    b.N * c, // b.N requests per thread.
 				Concurrency: c,
@@ -47,14 +56,14 @@ func runRuby(b *testing.B, hey *tools.Hey) {
 	// The machine to hold Redis and the Ruby Server.
 	serverMachine, err := h.GetMachine()
 	if err != nil {
-		b.Fatal("failed to get machine with: %v", err)
+		b.Fatalf("failed to get machine with: %v", err)
 	}
 	defer serverMachine.CleanUp()
 
 	// The machine to run 'hey'.
 	clientMachine, err := h.GetMachine()
 	if err != nil {
-		b.Fatal("failed to get machine with: %v", err)
+		b.Fatalf("failed to get machine with: %v", err)
 	}
 	defer clientMachine.CleanUp()
 	ctx := context.Background()
diff --git a/test/benchmarks/network/static_server.go b/test/benchmarks/network/static_server.go
new file mode 100644
index 000000000..e747a1395
--- /dev/null
+++ b/test/benchmarks/network/static_server.go
@@ -0,0 +1,87 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package network
+
+import (
+	"context"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/test/benchmarks/harness"
+	"gvisor.dev/gvisor/test/benchmarks/tools"
+)
+
+// runStaticServer runs static serving workloads (httpd, nginx).
+func runStaticServer(b *testing.B, serverOpts dockerutil.RunOpts, serverCmd []string, port int, hey *tools.Hey, reverse bool) {
+	ctx := context.Background()
+
+	// Get two machines: a client and server.
+	clientMachine, err := h.GetMachine()
+	if err != nil {
+		b.Fatalf("failed to get machine: %v", err)
+	}
+	defer clientMachine.CleanUp()
+
+	serverMachine, err := h.GetMachine()
+	if err != nil {
+		b.Fatalf("failed to get machine: %v", err)
+	}
+	defer serverMachine.CleanUp()
+
+	// Make the containers. 'reverse=true' specifies that the client should use the
+	// runtime under test.
+	var client, server *dockerutil.Container
+	if reverse {
+		client = clientMachine.GetContainer(ctx, b)
+		server = serverMachine.GetNativeContainer(ctx, b)
+	} else {
+		client = clientMachine.GetNativeContainer(ctx, b)
+		server = serverMachine.GetContainer(ctx, b)
+	}
+	defer client.CleanUp(ctx)
+	defer server.CleanUp(ctx)
+
+	// Start the server.
+	if err := server.Spawn(ctx, serverOpts, serverCmd...); err != nil {
+		b.Fatalf("failed to start server: %v", err)
+	}
+
+	// Get its IP.
+	ip, err := serverMachine.IPAddress()
+	if err != nil {
+		b.Fatalf("failed to find server ip: %v", err)
+	}
+
+	// Get the published port.
+	servingPort, err := server.FindPort(ctx, port)
+	if err != nil {
+		b.Fatalf("failed to find server port %d: %v", port, err)
+	}
+
+	// Make sure the server is serving.
+	harness.WaitUntilServing(ctx, clientMachine, ip, servingPort)
+	b.ResetTimer()
+	server.RestartProfiles()
+	out, err := client.Run(ctx, dockerutil.RunOpts{
+		Image: "benchmarks/hey",
+	}, hey.MakeCmd(ip, servingPort)...)
+	if err != nil {
+		b.Fatalf("run failed with: %v", err)
+	}
+
+	b.StopTimer()
+	hey.Report(b, out)
+	b.StartTimer()
+}
diff --git a/test/benchmarks/tcp/tcp_proxy.go b/test/benchmarks/tcp/tcp_proxy.go
index 4b7ca7a14..5afe10f69 100644
--- a/test/benchmarks/tcp/tcp_proxy.go
+++ b/test/benchmarks/tcp/tcp_proxy.go
@@ -174,8 +174,8 @@ func newNetstackImpl(mode string) (impl, error) {
 	}
 
 	// Create a new network stack.
-	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), arp.NewProtocol()}
-	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol()}
+	netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, arp.NewProtocol}
+	transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol}
 	s := stack.New(stack.Options{
 		NetworkProtocols:   netProtos,
 		TransportProtocols: transProtos,
@@ -228,19 +228,26 @@ func newNetstackImpl(mode string) (impl, error) {
 	})
 
 	// Set protocol options.
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(*sack)); err != nil {
-		return nil, fmt.Errorf("SetTransportProtocolOption for SACKEnabled failed: %s", err)
+	{
+		opt := tcpip.TCPSACKEnabled(*sack)
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	// Enable Receive Buffer Auto-Tuning.
-	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(*moderateRecvBuf)); err != nil {
-		return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPModerateReceiveBufferOption(*moderateRecvBuf)
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	// Set Congestion Control to cubic if requested.
 	if *cubic {
-		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.CongestionControlOption("cubic")); err != nil {
-			return nil, fmt.Errorf("SetTransportProtocolOption for CongestionControlOption(cubic) failed: %s", err)
+		opt := tcpip.CongestionControlOption("cubic")
+		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, opt, opt, err)
 		}
 	}
 
diff --git a/test/benchmarks/tools/BUILD b/test/benchmarks/tools/BUILD
index e5734d85c..9290830d7 100644
--- a/test/benchmarks/tools/BUILD
+++ b/test/benchmarks/tools/BUILD
@@ -4,12 +4,14 @@ package(licenses = ["notice"])
 
 go_library(
     name = "tools",
+    testonly = 1,
     srcs = [
         "ab.go",
         "fio.go",
         "hey.go",
         "iperf.go",
         "meminfo.go",
+        "parser_util.go",
         "redis.go",
         "sysbench.go",
         "tools.go",
diff --git a/test/benchmarks/tools/ab.go b/test/benchmarks/tools/ab.go
index 4cc9c3bce..d9abf0763 100644
--- a/test/benchmarks/tools/ab.go
+++ b/test/benchmarks/tools/ab.go
@@ -46,18 +46,21 @@ func (a *ApacheBench) Report(b *testing.B, output string) {
 		b.Logf("failed to parse transferrate: %v", err)
 	}
 	b.ReportMetric(transferRate*1024, "transfer_rate_b/s") // Convert from Kb/s to b/s.
+	ReportCustomMetric(b, transferRate*1024, "transfer_rate" /*metric name*/, "bytes_per_second" /*unit*/)
 
 	latency, err := a.parseLatency(output)
 	if err != nil {
 		b.Logf("failed to parse latency: %v", err)
 	}
 	b.ReportMetric(latency/1000, "mean_latency_secs") // Convert from ms to s.
+	ReportCustomMetric(b, latency/1000, "mean_latency" /*metric name*/, "s" /*unit*/)
 
 	reqPerSecond, err := a.parseRequestsPerSecond(output)
 	if err != nil {
 		b.Logf("failed to parse requests per second: %v", err)
 	}
 	b.ReportMetric(reqPerSecond, "requests_per_second")
+	ReportCustomMetric(b, reqPerSecond, "requests_per_second" /*metric name*/, "QPS" /*unit*/)
 }
 
 var transferRateRE = regexp.MustCompile(`Transfer rate:\s+(\d+\.?\d+?)\s+\[Kbytes/sec\]\s+received`)
diff --git a/test/benchmarks/tools/fio.go b/test/benchmarks/tools/fio.go
index 20000db16..f5f60fa84 100644
--- a/test/benchmarks/tools/fio.go
+++ b/test/benchmarks/tools/fio.go
@@ -56,13 +56,13 @@ func (f *Fio) Report(b *testing.B, output string) {
 	if err != nil {
 		b.Fatalf("failed to parse bandwidth from %s with: %v", output, err)
 	}
-	b.ReportMetric(bw, "bandwidth_b/s") // in b/s.
+	ReportCustomMetric(b, bw, "bandwidth" /*metric name*/, "bytes_per_second" /*unit*/)
 
 	iops, err := f.parseIOps(output, isRead)
 	if err != nil {
 		b.Fatalf("failed to parse iops from %s with: %v", output, err)
 	}
-	b.ReportMetric(iops, "iops")
+	ReportCustomMetric(b, iops, "io_ops" /*metric name*/, "ops_per_second" /*unit*/)
 }
 
 // parseBandwidth reports the bandwidth in b/s.
diff --git a/test/benchmarks/tools/hey.go b/test/benchmarks/tools/hey.go
index b1e20e356..b8cb938fe 100644
--- a/test/benchmarks/tools/hey.go
+++ b/test/benchmarks/tools/hey.go
@@ -43,13 +43,13 @@ func (h *Hey) Report(b *testing.B, output string) {
 	if err != nil {
 		b.Fatalf("failed to parse requests per second: %v", err)
 	}
-	b.ReportMetric(requests, "requests_per_second")
+	ReportCustomMetric(b, requests, "requests_per_second" /*metric name*/, "QPS" /*unit*/)
 
 	ave, err := h.parseAverageLatency(output)
 	if err != nil {
 		b.Fatalf("failed to parse average latency: %v", err)
 	}
-	b.ReportMetric(ave, "average_latency_secs")
+	ReportCustomMetric(b, ave, "average_latency" /*metric name*/, "s" /*unit*/)
 }
 
 var heyReqPerSecondRE = regexp.MustCompile(`Requests/sec:\s*(\d+\.?\d+?)\s+`)
diff --git a/test/benchmarks/tools/iperf.go b/test/benchmarks/tools/iperf.go
index df3d9349b..5c4e7125b 100644
--- a/test/benchmarks/tools/iperf.go
+++ b/test/benchmarks/tools/iperf.go
@@ -42,7 +42,7 @@ func (i *Iperf) Report(b *testing.B, output string) {
 	if err != nil {
 		b.Fatalf("failed to parse bandwitdth from %s: %v", output, err)
 	}
-	b.ReportMetric(bW*1024, "bandwidth_b/s") // Convert from Kb/s to b/s.
+	ReportCustomMetric(b, bW*1024, "bandwidth" /*metric name*/, "bytes_per_second" /*unit*/)
 }
 
 // bandwidth parses the Bandwidth number from an iperf report. A sample is below.
diff --git a/test/benchmarks/tools/meminfo.go b/test/benchmarks/tools/meminfo.go
index 2414a96a7..b5786fe11 100644
--- a/test/benchmarks/tools/meminfo.go
+++ b/test/benchmarks/tools/meminfo.go
@@ -45,7 +45,7 @@ func (*Meminfo) Report(b *testing.B, before, after string) {
 		b.Fatalf("could not parse before value %s: %v", before, err)
 	}
 	val := 1024 * ((beforeVal - afterVal) / float64(b.N))
-	b.ReportMetric(val, "average_container_size_bytes")
+	ReportCustomMetric(b, val, "average_container_size" /*metric name*/, "bytes" /*units*/)
 }
 
 var memInfoRE = regexp.MustCompile(`MemAvailable:\s*(\d+)\skB\n`)
diff --git a/test/benchmarks/tools/parser_util.go b/test/benchmarks/tools/parser_util.go
new file mode 100644
index 000000000..a4555c7dd
--- /dev/null
+++ b/test/benchmarks/tools/parser_util.go
@@ -0,0 +1,101 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tools
+
+import (
+	"fmt"
+	"regexp"
+	"strconv"
+	"strings"
+	"testing"
+)
+
+// Parameter is a test parameter.
+type Parameter struct {
+	Name  string
+	Value string
+}
+
+// Output is parsed and split by these values. Make them illegal in input methods.
+// We are constrained on what characters these can be by 1) docker's allowable
+// container names, 2) golang allowable benchmark names, and 3) golangs allowable
+// charecters in b.ReportMetric calls.
+var illegalChars = regexp.MustCompile(`[/\.]`)
+
+// ParametersToName joins parameters into a string format for parsing.
+// It is meant to be used for t.Run() calls in benchmark tools.
+func ParametersToName(params ...Parameter) (string, error) {
+	var strs []string
+	for _, param := range params {
+		if illegalChars.MatchString(param.Name) || illegalChars.MatchString(param.Value) {
+			return "", fmt.Errorf("params Name: %q and Value: %q cannot container '.' or '/'", param.Name, param.Value)
+		}
+		strs = append(strs, strings.Join([]string{param.Name, param.Value}, "."))
+	}
+	return strings.Join(strs, "/"), nil
+}
+
+// NameToParameters parses the string created by ParametersToName and returns
+// it as a set of Parameters.
+// Example: BenchmarkRuby/server_threads.1/doc_size.16KB-6
+// The parameter part of this benchmark is:
+// "server_threads.1/doc_size.16KB" (BenchmarkRuby is the name, and 6 is GOMAXPROCS)
+// This function will return a slice with two parameters ->
+// {Name: server_threads, Value: 1}, {Name: doc_size, Value: 16KB}
+func NameToParameters(name string) ([]*Parameter, error) {
+	var params []*Parameter
+	for _, cond := range strings.Split(name, "/") {
+		cs := strings.Split(cond, ".")
+		switch len(cs) {
+		case 1:
+			params = append(params, &Parameter{Name: cond, Value: cond})
+		case 2:
+			params = append(params, &Parameter{Name: cs[0], Value: cs[1]})
+		default:
+			return nil, fmt.Errorf("failed to parse param: %s", cond)
+		}
+	}
+	return params, nil
+}
+
+// ReportCustomMetric reports a metric in a set format for parsing.
+func ReportCustomMetric(b *testing.B, value float64, name, unit string) {
+	if illegalChars.MatchString(name) || illegalChars.MatchString(unit) {
+		b.Fatalf("name: %q and unit: %q cannot contain '/' or '.'", name, unit)
+	}
+	nameUnit := strings.Join([]string{name, unit}, ".")
+	b.ReportMetric(value, nameUnit)
+}
+
+// Metric holds metric data parsed from a string based on the format
+// ReportMetric.
+type Metric struct {
+	Name   string
+	Unit   string
+	Sample float64
+}
+
+// ParseCustomMetric parses a metric reported with ReportCustomMetric.
+func ParseCustomMetric(value, metric string) (*Metric, error) {
+	sample, err := strconv.ParseFloat(value, 64)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse value: %v", err)
+	}
+	nameUnit := strings.Split(metric, ".")
+	if len(nameUnit) != 2 {
+		return nil, fmt.Errorf("failed to parse metric: %s", metric)
+	}
+	return &Metric{Name: nameUnit[0], Unit: nameUnit[1], Sample: sample}, nil
+}
diff --git a/test/benchmarks/tools/redis.go b/test/benchmarks/tools/redis.go
index c899ae0d4..e35886437 100644
--- a/test/benchmarks/tools/redis.go
+++ b/test/benchmarks/tools/redis.go
@@ -49,7 +49,7 @@ func (r *Redis) Report(b *testing.B, output string) {
 	if err != nil {
 		b.Fatalf("parsing result %s failed with err: %v", output, err)
 	}
-	b.ReportMetric(result, r.Operation) // operations per second
+	ReportCustomMetric(b, result, r.Operation /*metric_name*/, "QPS" /*unit*/)
 }
 
 // parseOperation grabs the metric operations per second from redis-benchmark output.
diff --git a/test/benchmarks/tools/sysbench.go b/test/benchmarks/tools/sysbench.go
index 6b2f75ca2..7ccacd8ff 100644
--- a/test/benchmarks/tools/sysbench.go
+++ b/test/benchmarks/tools/sysbench.go
@@ -80,7 +80,7 @@ func (s *SysbenchCPU) Report(b *testing.B, output string) {
 	if err != nil {
 		b.Fatalf("parsing CPU events from %s failed: %v", output, err)
 	}
-	b.ReportMetric(result, "cpu_events_per_second")
+	ReportCustomMetric(b, result, "cpu_events" /*metric name*/, "events_per_second" /*unit*/)
 }
 
 var cpuEventsPerSecondRE = regexp.MustCompile(`events per second:\s*(\d*.?\d*)\n`)
@@ -144,7 +144,7 @@ func (s *SysbenchMemory) Report(b *testing.B, output string) {
 	if err != nil {
 		b.Fatalf("parsing result %s failed with err: %v", output, err)
 	}
-	b.ReportMetric(result, "operations_per_second")
+	ReportCustomMetric(b, result, "memory_operations" /*metric name*/, "ops_per_second" /*unit*/)
 }
 
 var memoryOperationsRE = regexp.MustCompile(`Total\soperations:\s+\d*\s*\((\d*\.\d*)\sper\ssecond\)`)
@@ -198,19 +198,19 @@ func (s *SysbenchMutex) Report(b *testing.B, output string) {
 	if err != nil {
 		b.Fatalf("parsing result %s failed with err: %v", output, err)
 	}
-	b.ReportMetric(result, "average_execution_time_secs")
+	ReportCustomMetric(b, result, "average_execution_time" /*metric name*/, "s" /*unit*/)
 
 	result, err = s.parseDeviation(output)
 	if err != nil {
 		b.Fatalf("parsing result %s failed with err: %v", output, err)
 	}
-	b.ReportMetric(result, "stdev_execution_time_secs")
+	ReportCustomMetric(b, result, "stddev_execution_time" /*metric name*/, "s" /*unit*/)
 
 	result, err = s.parseLatency(output)
 	if err != nil {
 		b.Fatalf("parsing result %s failed with err: %v", output, err)
 	}
-	b.ReportMetric(result/1000, "average_latency_secs")
+	ReportCustomMetric(b, result/1000, "average_latency" /*metric name*/, "s" /*unit*/)
 }
 
 var executionTimeRE = regexp.MustCompile(`execution time \(avg/stddev\):\s*(\d*.?\d*)/(\d*.?\d*)`)
diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go
index 809244bab..8425abecb 100644
--- a/test/e2e/integration_test.go
+++ b/test/e2e/integration_test.go
@@ -64,9 +64,10 @@ func TestLifeCycle(t *testing.T) {
 	defer d.CleanUp(ctx)
 
 	// Start the container.
+	port := 80
 	if err := d.Create(ctx, dockerutil.RunOpts{
 		Image: "basic/nginx",
-		Ports: []int{80},
+		Ports: []int{port},
 	}); err != nil {
 		t.Fatalf("docker create failed: %v", err)
 	}
@@ -74,16 +75,15 @@ func TestLifeCycle(t *testing.T) {
 		t.Fatalf("docker start failed: %v", err)
 	}
 
-	// Test that container is working.
-	port, err := d.FindPort(ctx, 80)
+	ip, err := d.FindIP(ctx, false)
 	if err != nil {
-		t.Fatalf("docker.FindPort(80) failed: %v", err)
+		t.Fatalf("docker.FindIP failed: %v", err)
 	}
-	if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+	if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
 		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 	client := http.Client{Timeout: defaultWait}
-	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
+	if err := httpRequestSucceeds(client, ip.String(), port); err != nil {
 		t.Errorf("http request failed: %v", err)
 	}
 
@@ -105,27 +105,28 @@ func TestPauseResume(t *testing.T) {
 	defer d.CleanUp(ctx)
 
 	// Start the container.
+	port := 8080
 	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/python",
-		Ports: []int{8080}, // See Dockerfile.
+		Ports: []int{port}, // See Dockerfile.
 	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
-	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(ctx, 8080)
+	// Find container IP address.
+	ip, err := d.FindIP(ctx, false)
 	if err != nil {
-		t.Fatalf("docker.FindPort(8080) failed: %v", err)
+		t.Fatalf("docker.FindIP failed: %v", err)
 	}
 
 	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+	if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
 		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 
 	// Check that container is working.
 	client := http.Client{Timeout: defaultWait}
-	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
+	if err := httpRequestSucceeds(client, ip.String(), port); err != nil {
 		t.Error("http request failed:", err)
 	}
 
@@ -135,7 +136,7 @@ func TestPauseResume(t *testing.T) {
 
 	// Check if container is paused.
 	client = http.Client{Timeout: 10 * time.Millisecond} // Don't wait a minute.
-	switch _, err := client.Get(fmt.Sprintf("http://localhost:%d", port)); v := err.(type) {
+	switch _, err := client.Get(fmt.Sprintf("http://%s:%d", ip.String(), port)); v := err.(type) {
 	case nil:
 		t.Errorf("http req expected to fail but it succeeded")
 	case net.Error:
@@ -151,13 +152,13 @@ func TestPauseResume(t *testing.T) {
 	}
 
 	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+	if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
 		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 
 	// Check if container is working again.
 	client = http.Client{Timeout: defaultWait}
-	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
+	if err := httpRequestSucceeds(client, ip.String(), port); err != nil {
 		t.Error("http request failed:", err)
 	}
 }
@@ -179,9 +180,10 @@ func TestCheckpointRestore(t *testing.T) {
 	defer d.CleanUp(ctx)
 
 	// Start the container.
+	port := 8080
 	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/python",
-		Ports: []int{8080}, // See Dockerfile.
+		Ports: []int{port}, // See Dockerfile.
 	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
@@ -199,20 +201,20 @@ func TestCheckpointRestore(t *testing.T) {
 		t.Fatalf("docker restore failed: %v", err)
 	}
 
-	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(ctx, 8080)
+	// Find container IP address.
+	ip, err := d.FindIP(ctx, false)
 	if err != nil {
-		t.Fatalf("docker.FindPort(8080) failed: %v", err)
+		t.Fatalf("docker.FindIP failed: %v", err)
 	}
 
 	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+	if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
 		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 
 	// Check if container is working again.
 	client := http.Client{Timeout: defaultWait}
-	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
+	if err := httpRequestSucceeds(client, ip.String(), port); err != nil {
 		t.Error("http request failed:", err)
 	}
 }
diff --git a/test/fuse/BUILD b/test/fuse/BUILD
index 56157c96b..8e31fdd41 100644
--- a/test/fuse/BUILD
+++ b/test/fuse/BUILD
@@ -5,5 +5,69 @@ package(licenses = ["notice"])
 syscall_test(
     fuse = "True",
     test = "//test/fuse/linux:stat_test",
-    vfs2 = "True",
+)
+
+syscall_test(
+    fuse = "True",
+    test = "//test/fuse/linux:open_test",
+)
+
+syscall_test(
+    fuse = "True",
+    test = "//test/fuse/linux:release_test",
+)
+
+syscall_test(
+    fuse = "True",
+    test = "//test/fuse/linux:mknod_test",
+)
+
+syscall_test(
+    fuse = "True",
+    test = "//test/fuse/linux:symlink_test",
+)
+
+syscall_test(
+    fuse = "True",
+    test = "//test/fuse/linux:readlink_test",
+)
+
+syscall_test(
+    fuse = "True",
+    test = "//test/fuse/linux:mkdir_test",
+)
+
+syscall_test(
+    fuse = "True",
+    test = "//test/fuse/linux:read_test",
+)
+
+syscall_test(
+    fuse = "True",
+    test = "//test/fuse/linux:write_test",
+)
+
+syscall_test(
+    fuse = "True",
+    test = "//test/fuse/linux:rmdir_test",
+)
+
+syscall_test(
+    fuse = "True",
+    test = "//test/fuse/linux:readdir_test",
+)
+
+syscall_test(
+    fuse = "True",
+    test = "//test/fuse/linux:create_test",
+)
+
+syscall_test(
+    fuse = "True",
+    test = "//test/fuse/linux:unlink_test",
+)
+
+syscall_test(
+    fuse = "True",
+    test = "//test/fuse/linux:setstat_test",
 )
diff --git a/test/fuse/README.md b/test/fuse/README.md
index 734c3a4e3..65add57e2 100644
--- a/test/fuse/README.md
+++ b/test/fuse/README.md
@@ -1,65 +1,114 @@
 # gVisor FUSE Test Suite
 
-This is an integration test suite for fuse(4) filesystem. It runs under both
-gVisor and Linux, and ensures compatibility between the two. This test suite is
-based on system calls test.
+This is an integration test suite for fuse(4) filesystem. It runs under gVisor
+sandbox container with VFS2 and FUSE function enabled.
 
-This document describes the framework of fuse integration test and the
-guidelines that should be followed when adding new fuse tests.
+This document describes the framework of FUSE integration test, how to use it,
+and the guidelines that should be followed when adding new testing features.
 
 ## Integration Test Framework
 
-Please refer to the figure below. `>` is entering the function, `<` is leaving
-the function, and `=` indicates sequentially entering and leaving.
+By inheriting the `FuseTest` class defined in `linux/fuse_base.h`, every test
+fixture can run in an environment with `mount_point_` mounted by a fake FUSE
+server. It creates a `socketpair(2)` to send and receive control commands and
+data between the client and the server. Because the FUSE server runs in the
+background thread, gTest cannot catch its assertion failure immediately. Thus,
+`TearDown()` function sends command to the FUSE server to check if all gTest
+assertion in the server are successful and all requests and preset responses are
+consumed.
+
+## Communication Diagram
+
+Diagram below describes how a testing thread communicates with the FUSE server
+to achieve integration test.
+
+For the following diagram, `>` means entering the function, `<` is leaving the
+function, and `=` indicates sequentially entering and leaving. Not necessarily
+follow exactly the below diagram due to the nature of a multi-threaded system,
+however, it is still helpful to know when the client waits for the server to
+complete a command and when the server awaits the next instruction.
 
 ```
- |  Client (Test Main Process)         |  Server (FUSE Daemon)
- |                                     |
- |  >TEST_F()                          |
- |    >SetUp()                         |
- |      =MountFuse()                   |
- |      >SetUpFuseServer()             |
- |        [create communication pipes] |
- |        =fork()                      |        =fork()
- |        >WaitCompleted()             |
- |          [wait for MarkDone()]      |
- |                                     |        =ConsumeFuseInit()
- |                                     |        =MarkDone()
- |        <WaitCompleted()             |
- |      <SetUpFuseServer()             |
- |    <SetUp()                         |
- |    >SetExpected()                   |
- |      [construct expected reaction]  |
- |                                     |        >FuseLoop()
- |                                     |          >ReceiveExpected()
- |                                     |            [wait data from pipe]
- |      [write data to pipe]           |
- |      [wait for MarkDone()]          |
- |                                     |            [save data to memory]
- |                                     |            =MarkDone()
- |    <SetExpected()                   |
- |                                     |          <ReceiveExpected()
- |                                     |          >read()
- |                                     |            [wait for fs operation]
- |    >[Do fs operation]               |
- |      [wait for fs response]         |
- |                                     |          <read()
- |                                     |          =CompareRequest()
- |                                     |          =write() [write fs response]
- |    <[Do fs operation]               |
- |    =[Test fs operation result]      |
- |    =[wait for MarkDone()]           |
- |                                     |          =MarkDone()
- |    >TearDown()                      |
- |      =UnmountFuse()                 |
- |    <TearDown()                      |
- |  <TEST_F()                          |
+|  Client (Testing Thread)            |  Server (FUSE Server Thread)
+|                                     |
+|  >TEST_F()                          |
+|    >SetUp()                         |
+|      =MountFuse()                   |
+|      >SetUpFuseServer()             |
+|        [create communication socket]|
+|        =fork()                      |      =fork()
+|        [wait server complete]       |
+|                                     |      =ServerConsumeFuseInit()
+|                                     |      =ServerCompleteWith()
+|      <SetUpFuseServer()             |
+|    <SetUp()                         |
+|    [testing main]                   |
+|                                     |      >ServerFuseLoop()
+|                                     |        [poll on socket and fd]
+|    >SetServerResponse()             |
+|      [write data to socket]         |
+|      [wait server complete]         |
+|                                     |        [socket event occurs]
+|                                     |        >ServerHandleCommand()
+|                                     |          >ServerReceiveResponse()
+|                                     |            [read data from socket]
+|                                     |            [save data to memory]
+|                                     |          <ServerReceiveResponse()
+|                                     |          =ServerCompleteWith()
+|    <SetServerResponse()             |
+|                                     |        <ServerHandleCommand()
+|    >[Do fs operation]               |
+|      [wait for fs response]         |
+|                                     |        [fd event occurs]
+|                                     |        >ServerProcessFuseRequest()
+|                                     |          =[read fs request]
+|                                     |          =[save fs request to memory]
+|                                     |          =[write fs response]
+|    <[Do fs operation]               |
+|                                     |        <ServerProcessFuseRequest()
+|                                     |
+|    =[Test fs operation result]      |
+|                                     |
+|    >GetServerActualRequest()        |
+|      [write data to socket]         |
+|      [wait data from server]        |
+|                                     |        [socket event occurs]
+|                                     |        >ServerHandleCommand()
+|                                     |          >ServerSendReceivedRequest()
+|                                     |            [write data to socket]
+|      [read data from socket]        |
+|      [wait server complete]         |
+|                                     |          <ServerSendReceivedRequest()
+|                                     |          =ServerCompleteWith()
+|    <GetServerActualRequest()        |
+|                                     |        <ServerHandleCommand()
+|                                     |
+|    =[Test actual request]           |
+|                                     |
+|    >TearDown()                      |
+|      ...                            |
+|      >GetServerNumUnsentResponses() |
+|        [write data to socket]       |
+|        [wait server complete]       |
+|                                     |        [socket event arrive]
+|                                     |        >ServerHandleCommand()
+|                                     |          >ServerSendData()
+|                                     |            [write data to socket]
+|                                     |          <ServerSendData()
+|                                     |          =ServerCompleteWith()
+|        [read data from socket]      |
+|        [test if all succeeded]      |
+|      <GetServerNumUnsentResponses() |
+|                                     |        <ServerHandleCommand()
+|      =UnmountFuse()                 |
+|    <TearDown()                      |
+|  <TEST_F()                          |
 ```
 
 ## Running the tests
 
-Based on syscall tests, fuse tests can run in different environments. To enable
-fuse testing environment, the test targets should be appended with `_fuse`.
+Based on syscall tests, FUSE tests generate targets only with vfs2 and fuse
+enabled. The corresponding targets end in `_fuse`.
 
 For example, to run fuse test in `stat_test.cc`:
 
@@ -77,17 +126,16 @@ $ bazel test --test_tag_filters=fuse //test/fuse/...
 
 1.  Add test targets in `BUILD` and `linux/BUILD`.
 2.  Inherit your test from `FuseTest` base class. It allows you to:
-    -   Run a fake FUSE server in background during each test setup.
-    -   Create pipes for communication and provide utility functions.
-    -   Stop FUSE server after test completes.
-3.  Customize your comparison function for request assessment in FUSE server.
-4.  Add the mapping of the size of structs if you are working on new FUSE
-    opcode.
-    -   Please update `FuseTest::GetPayloadSize()` for each new FUSE opcode.
-5.  Build the expected request-response pair of your FUSE operation.
-6.  Call `SetExpected()` function to inject the expected reaction.
-7.  Check the response and/or errors.
-8.  Finally call `WaitCompleted()` to ensure the FUSE server acts correctly.
+    -   Fork a fake FUSE server in background during each test setup.
+    -   Create a pair of sockets for communication and provide utility
+        functions.
+    -   Stop FUSE server and check if error occurs in it after test completes.
+3.  Build the expected opcode-response pairs of your FUSE operation.
+4.  Call `SetServerResponse()` to preset the next expected opcode and response.
+5.  Do real filesystem operations (FUSE is mounted at `mount_point_`).
+6.  Check FUSE response and/or errors.
+7.  Retrieve FUSE request by `GetServerActualRequest()`.
+8.  Check if the request is as expected.
 
 A few customized matchers used in syscalls test are encouraged to test the
 outcome of filesystem operations. Such as:
@@ -101,3 +149,40 @@ SyscallFailsWithErrno(...)
 
 Please refer to [test/syscalls/README.md](../syscalls/README.md) for further
 details.
+
+## Writing a new FuseTestCmd
+
+A `FuseTestCmd` is a control protocol used in the communication between the
+testing thread and the FUSE server. Such commands are sent from the testing
+thread to the FUSE server to set up, control, or inspect the behavior of the
+FUSE server in response to a sequence of FUSE requests.
+
+The lifecycle of a command contains following steps:
+
+1.  The testing thread sends a `FuseTestCmd` via socket and waits for
+    completion.
+2.  The FUSE server receives the command and does corresponding action.
+3.  (Optional) The testing thread reads data from socket.
+4.  The FUSE server sends a success indicator via socket after processing.
+5.  The testing thread gets the success signal and continues testing.
+
+The success indicator, i.e. `WaitServerComplete()`, is crucial at the end of
+each `FuseTestCmd` sent from the testing thread. Because we don't want to begin
+filesystem operation if the requests have not been completely set up. Also, to
+test FUSE interactions in a sequential manner, concurrent requests are not
+supported now.
+
+To add a new `FuseTestCmd`, one must comply with following format:
+
+1.  Add a new `FuseTestCmd` enum class item defined in `linux/fuse_base.h`
+2.  Add a `SetServerXXX()` or `GetServerXXX()` public function in `FuseTest`.
+    This is how the testing thread will call to send control message. Define how
+    many bytes you want to send along with the command and what you will expect
+    to receive. Finally it should block and wait for a success indicator from
+    the FUSE server.
+3.  Add a handler logic in the switch condition of `ServerHandleCommand()`. Use
+    `ServerSendData()` or declare a new private function such as
+    `ServerReceiveXXX()` or `ServerSendXXX()`. It is mandatory to set it private
+    since only the FUSE server (forked from `FuseTest` base class) can call it.
+    This is the server part of the specific `FuseTestCmd` and the format of the
+    data should be consistent with what the client expects in the previous step.
diff --git a/test/fuse/linux/BUILD b/test/fuse/linux/BUILD
index 4871bb531..7673252ec 100644
--- a/test/fuse/linux/BUILD
+++ b/test/fuse/linux/BUILD
@@ -11,7 +11,134 @@ cc_binary(
     srcs = ["stat_test.cc"],
     deps = [
         gtest,
+        ":fuse_fd_util",
+        "//test/util:cleanup",
+        "//test/util:fs_util",
+        "//test/util:fuse_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "open_test",
+    testonly = 1,
+    srcs = ["open_test.cc"],
+    deps = [
+        gtest,
         ":fuse_base",
+        "//test/util:fuse_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "release_test",
+    testonly = 1,
+    srcs = ["release_test.cc"],
+    deps = [
+        gtest,
+        ":fuse_base",
+        "//test/util:fuse_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "mknod_test",
+    testonly = 1,
+    srcs = ["mknod_test.cc"],
+    deps = [
+        gtest,
+        ":fuse_base",
+        "//test/util:fuse_util",
+        "//test/util:temp_umask",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "symlink_test",
+    testonly = 1,
+    srcs = ["symlink_test.cc"],
+    deps = [
+        gtest,
+        ":fuse_base",
+        "//test/util:fuse_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "readlink_test",
+    testonly = 1,
+    srcs = ["readlink_test.cc"],
+    deps = [
+        gtest,
+        ":fuse_base",
+        "//test/util:fuse_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "mkdir_test",
+    testonly = 1,
+    srcs = ["mkdir_test.cc"],
+    deps = [
+        gtest,
+        ":fuse_base",
+        "//test/util:fuse_util",
+        "//test/util:temp_umask",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "setstat_test",
+    testonly = 1,
+    srcs = ["setstat_test.cc"],
+    deps = [
+        gtest,
+        ":fuse_fd_util",
+        "//test/util:cleanup",
+        "//test/util:fs_util",
+        "//test/util:fuse_util",
+        "//test/util:temp_umask",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "rmdir_test",
+    testonly = 1,
+    srcs = ["rmdir_test.cc"],
+    deps = [
+        gtest,
+        ":fuse_base",
+        "//test/util:fs_util",
+        "//test/util:fuse_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "readdir_test",
+    testonly = 1,
+    srcs = ["readdir_test.cc"],
+    deps = [
+        gtest,
+        ":fuse_base",
+        "//test/util:fs_util",
+        "//test/util:fuse_util",
         "//test/util:test_main",
         "//test/util:test_util",
     ],
@@ -24,9 +151,80 @@ cc_library(
     hdrs = ["fuse_base.h"],
     deps = [
         gtest,
+        "//test/util:fuse_util",
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_util",
         "@com_google_absl//absl/strings:str_format",
     ],
 )
+
+cc_library(
+    name = "fuse_fd_util",
+    testonly = 1,
+    srcs = ["fuse_fd_util.cc"],
+    hdrs = ["fuse_fd_util.h"],
+    deps = [
+        gtest,
+        ":fuse_base",
+        "//test/util:cleanup",
+        "//test/util:file_descriptor",
+        "//test/util:fuse_util",
+        "//test/util:posix_error",
+    ],
+)
+
+cc_binary(
+    name = "read_test",
+    testonly = 1,
+    srcs = ["read_test.cc"],
+    deps = [
+        gtest,
+        ":fuse_base",
+        "//test/util:fuse_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "write_test",
+    testonly = 1,
+    srcs = ["write_test.cc"],
+    deps = [
+        gtest,
+        ":fuse_base",
+        "//test/util:fuse_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "create_test",
+    testonly = 1,
+    srcs = ["create_test.cc"],
+    deps = [
+        gtest,
+        ":fuse_base",
+        "//test/util:fs_util",
+        "//test/util:fuse_util",
+        "//test/util:temp_umask",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "unlink_test",
+    testonly = 1,
+    srcs = ["unlink_test.cc"],
+    deps = [
+        gtest,
+        ":fuse_base",
+        "//test/util:fuse_util",
+        "//test/util:temp_umask",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
diff --git a/test/fuse/linux/create_test.cc b/test/fuse/linux/create_test.cc
new file mode 100644
index 000000000..9a0219a58
--- /dev/null
+++ b/test/fuse/linux/create_test.cc
@@ -0,0 +1,128 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "test/fuse/linux/fuse_base.h"
+#include "test/util/fs_util.h"
+#include "test/util/fuse_util.h"
+#include "test/util/temp_umask.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class CreateTest : public FuseTest {
+ protected:
+  const std::string test_file_name_ = "test_file";
+  const mode_t mode = S_IFREG | S_IRWXU | S_IRWXG | S_IRWXO;
+};
+
+TEST_F(CreateTest, CreateFile) {
+  const std::string test_file_path =
+      JoinPath(mount_point_.path().c_str(), test_file_name_);
+
+  // Ensure the file doesn't exist.
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header),
+      .error = -ENOENT,
+  };
+  auto iov_out = FuseGenerateIovecs(out_header);
+  SetServerResponse(FUSE_LOOKUP, iov_out);
+
+  // creat(2) is equal to open(2) with open_flags O_CREAT | O_WRONLY | O_TRUNC.
+  const mode_t new_mask = S_IWGRP | S_IWOTH;
+  const int open_flags = O_CREAT | O_WRONLY | O_TRUNC;
+  out_header.error = 0;
+  out_header.len = sizeof(struct fuse_out_header) +
+                   sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out);
+  struct fuse_entry_out entry_payload = DefaultEntryOut(mode & ~new_mask, 2);
+  struct fuse_open_out out_payload = {
+      .fh = 1,
+      .open_flags = open_flags,
+  };
+  iov_out = FuseGenerateIovecs(out_header, entry_payload, out_payload);
+  SetServerResponse(FUSE_CREATE, iov_out);
+
+  // kernfs generates a successive FUSE_OPEN after the file is created. Linux's
+  // fuse kernel module will not send this FUSE_OPEN after creat(2).
+  out_header.len =
+      sizeof(struct fuse_out_header) + sizeof(struct fuse_open_out);
+  iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_OPEN, iov_out);
+
+  int fd;
+  TempUmask mask(new_mask);
+  EXPECT_THAT(fd = creat(test_file_path.c_str(), mode), SyscallSucceeds());
+  EXPECT_THAT(fcntl(fd, F_GETFL),
+              SyscallSucceedsWithValue(open_flags & O_ACCMODE));
+
+  struct fuse_in_header in_header;
+  struct fuse_create_in in_payload;
+  std::vector<char> name(test_file_name_.size() + 1);
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload, name);
+
+  // Skip the request of FUSE_LOOKUP.
+  SkipServerActualRequest();
+
+  // Get the first FUSE_CREATE.
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload) +
+                               test_file_name_.size() + 1);
+  EXPECT_EQ(in_header.opcode, FUSE_CREATE);
+  EXPECT_EQ(in_payload.flags, open_flags);
+  EXPECT_EQ(in_payload.mode, mode & ~new_mask);
+  EXPECT_EQ(in_payload.umask, new_mask);
+  EXPECT_EQ(std::string(name.data()), test_file_name_);
+
+  // Get the successive FUSE_OPEN.
+  struct fuse_open_in in_payload_open;
+  iov_in = FuseGenerateIovecs(in_header, in_payload_open);
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload_open));
+  EXPECT_EQ(in_header.opcode, FUSE_OPEN);
+  EXPECT_EQ(in_payload_open.flags, open_flags & O_ACCMODE);
+
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+  // Skip the FUSE_RELEASE.
+  SkipServerActualRequest();
+}
+
+TEST_F(CreateTest, CreateFileAlreadyExists) {
+  const std::string test_file_path =
+      JoinPath(mount_point_.path().c_str(), test_file_name_);
+
+  const int open_flags = O_CREAT | O_EXCL;
+
+  SetServerInodeLookup(test_file_name_);
+
+  EXPECT_THAT(open(test_file_path.c_str(), mode, open_flags),
+              SyscallFailsWithErrno(EEXIST));
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/fuse/linux/fuse_base.cc b/test/fuse/linux/fuse_base.cc
index 9c3124472..5b45804e1 100644
--- a/test/fuse/linux/fuse_base.cc
+++ b/test/fuse/linux/fuse_base.cc
@@ -16,17 +16,17 @@
 
 #include <fcntl.h>
 #include <linux/fuse.h>
-#include <string.h>
+#include <poll.h>
 #include <sys/mount.h>
+#include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/uio.h>
 #include <unistd.h>
 
-#include <iostream>
-
 #include "gtest/gtest.h"
 #include "absl/strings/str_format.h"
+#include "test/util/fuse_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
@@ -39,49 +39,123 @@ void FuseTest::SetUp() {
   SetUpFuseServer();
 }
 
-void FuseTest::TearDown() { UnmountFuse(); }
-
-// Since CompareRequest is running in background thread, gTest assertions and
-// expectations won't directly reflect the test result. However, the FUSE
-// background server still connects to the same standard I/O as testing main
-// thread. So EXPECT_XX can still be used to show different results. To
-// ensure failed testing result is observable, return false and the result
-// will be sent to test main thread via pipe.
-bool FuseTest::CompareRequest(void* expected_mem, size_t expected_len,
-                              void* real_mem, size_t real_len) {
-  if (expected_len != real_len) return false;
-  return memcmp(expected_mem, real_mem, expected_len) == 0;
+void FuseTest::TearDown() {
+  EXPECT_EQ(GetServerNumUnconsumedRequests(), 0);
+  EXPECT_EQ(GetServerNumUnsentResponses(), 0);
+  UnmountFuse();
 }
 
-// SetExpected is called by the testing main thread to set expected request-
-// response pair of a single FUSE operation.
-void FuseTest::SetExpected(struct iovec* iov_in, int iov_in_cnt,
-                           struct iovec* iov_out, int iov_out_cnt) {
-  EXPECT_THAT(RetryEINTR(writev)(set_expected_[1], iov_in, iov_in_cnt),
-              SyscallSucceedsWithValue(::testing::Gt(0)));
-  WaitCompleted();
+// Sends 3 parts of data to the FUSE server:
+//   1. The `kSetResponse` command
+//   2. The expected opcode
+//   3. The fake FUSE response
+// Then waits for the FUSE server to notify its completion.
+void FuseTest::SetServerResponse(uint32_t opcode,
+                                 std::vector<struct iovec>& iovecs) {
+  uint32_t cmd = static_cast<uint32_t>(FuseTestCmd::kSetResponse);
+  EXPECT_THAT(RetryEINTR(write)(sock_[0], &cmd, sizeof(cmd)),
+              SyscallSucceedsWithValue(sizeof(cmd)));
+
+  EXPECT_THAT(RetryEINTR(write)(sock_[0], &opcode, sizeof(opcode)),
+              SyscallSucceedsWithValue(sizeof(opcode)));
 
-  EXPECT_THAT(RetryEINTR(writev)(set_expected_[1], iov_out, iov_out_cnt),
-              SyscallSucceedsWithValue(::testing::Gt(0)));
-  WaitCompleted();
+  EXPECT_THAT(RetryEINTR(writev)(sock_[0], iovecs.data(), iovecs.size()),
+              SyscallSucceeds());
+
+  WaitServerComplete();
 }
 
-// WaitCompleted waits for the FUSE server to finish its job and check if it
+// Waits for the FUSE server to finish its blocking job and check if it
 // completes without errors.
-void FuseTest::WaitCompleted() {
-  char success;
-  EXPECT_THAT(RetryEINTR(read)(done_[0], &success, sizeof(success)),
-              SyscallSucceedsWithValue(1));
+void FuseTest::WaitServerComplete() {
+  uint32_t success;
+  EXPECT_THAT(RetryEINTR(read)(sock_[0], &success, sizeof(success)),
+              SyscallSucceedsWithValue(sizeof(success)));
+  ASSERT_EQ(success, 1);
+}
+
+// Sends the `kGetRequest` command to the FUSE server, then reads the next
+// request into iovec struct. The order of calling this function should be
+// the same as the one of SetServerResponse().
+void FuseTest::GetServerActualRequest(std::vector<struct iovec>& iovecs) {
+  uint32_t cmd = static_cast<uint32_t>(FuseTestCmd::kGetRequest);
+  EXPECT_THAT(RetryEINTR(write)(sock_[0], &cmd, sizeof(cmd)),
+              SyscallSucceedsWithValue(sizeof(cmd)));
+
+  EXPECT_THAT(RetryEINTR(readv)(sock_[0], iovecs.data(), iovecs.size()),
+              SyscallSucceeds());
+
+  WaitServerComplete();
+}
+
+// Sends a FuseTestCmd command to the FUSE server, reads from the socket, and
+// returns the corresponding data.
+uint32_t FuseTest::GetServerData(uint32_t cmd) {
+  uint32_t data;
+  EXPECT_THAT(RetryEINTR(write)(sock_[0], &cmd, sizeof(cmd)),
+              SyscallSucceedsWithValue(sizeof(cmd)));
+
+  EXPECT_THAT(RetryEINTR(read)(sock_[0], &data, sizeof(data)),
+              SyscallSucceedsWithValue(sizeof(data)));
+
+  WaitServerComplete();
+  return data;
+}
+
+uint32_t FuseTest::GetServerNumUnconsumedRequests() {
+  return GetServerData(
+      static_cast<uint32_t>(FuseTestCmd::kGetNumUnconsumedRequests));
+}
+
+uint32_t FuseTest::GetServerNumUnsentResponses() {
+  return GetServerData(
+      static_cast<uint32_t>(FuseTestCmd::kGetNumUnsentResponses));
+}
+
+uint32_t FuseTest::GetServerTotalReceivedBytes() {
+  return GetServerData(
+      static_cast<uint32_t>(FuseTestCmd::kGetTotalReceivedBytes));
+}
+
+// Sends the `kSkipRequest` command to the FUSE server, which would skip
+// current stored request data.
+void FuseTest::SkipServerActualRequest() {
+  uint32_t cmd = static_cast<uint32_t>(FuseTestCmd::kSkipRequest);
+  EXPECT_THAT(RetryEINTR(write)(sock_[0], &cmd, sizeof(cmd)),
+              SyscallSucceedsWithValue(sizeof(cmd)));
+
+  WaitServerComplete();
 }
 
-void FuseTest::MountFuse() {
+// Sends the `kSetInodeLookup` command, expected mode, and the path of the
+// inode to create under the mount point.
+void FuseTest::SetServerInodeLookup(const std::string& path, mode_t mode,
+                                    uint64_t size) {
+  uint32_t cmd = static_cast<uint32_t>(FuseTestCmd::kSetInodeLookup);
+  EXPECT_THAT(RetryEINTR(write)(sock_[0], &cmd, sizeof(cmd)),
+              SyscallSucceedsWithValue(sizeof(cmd)));
+
+  EXPECT_THAT(RetryEINTR(write)(sock_[0], &mode, sizeof(mode)),
+              SyscallSucceedsWithValue(sizeof(mode)));
+
+  EXPECT_THAT(RetryEINTR(write)(sock_[0], &size, sizeof(size)),
+              SyscallSucceedsWithValue(sizeof(size)));
+
+  // Pad 1 byte for null-terminate c-string.
+  EXPECT_THAT(RetryEINTR(write)(sock_[0], path.c_str(), path.size() + 1),
+              SyscallSucceedsWithValue(path.size() + 1));
+
+  WaitServerComplete();
+}
+
+void FuseTest::MountFuse(const char* mountOpts) {
   EXPECT_THAT(dev_fd_ = open("/dev/fuse", O_RDWR), SyscallSucceeds());
 
-  std::string mount_opts = absl::StrFormat("fd=%d,%s", dev_fd_, kMountOpts);
+  std::string mount_opts = absl::StrFormat("fd=%d,%s", dev_fd_, mountOpts);
   mount_point_ = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   EXPECT_THAT(mount("fuse", mount_point_.path().c_str(), "fuse",
                     MS_NODEV | MS_NOSUID, mount_opts.c_str()),
-              SyscallSucceedsWithValue(0));
+              SyscallSucceeds());
 }
 
 void FuseTest::UnmountFuse() {
@@ -89,13 +163,13 @@ void FuseTest::UnmountFuse() {
   // TODO(gvisor.dev/issue/3330): ensure the process is terminated successfully.
 }
 
-// ConsumeFuseInit consumes the first FUSE request and returns the
-// corresponding PosixError.
-PosixError FuseTest::ConsumeFuseInit() {
+// Consumes the first FUSE request and returns the corresponding PosixError.
+PosixError FuseTest::ServerConsumeFuseInit(
+    const struct fuse_init_out* out_payload) {
+  std::vector<char> buf(FUSE_MIN_READ_BUFFER);
   RETURN_ERROR_IF_SYSCALL_FAIL(
-      RetryEINTR(read)(dev_fd_, buf_.data(), buf_.size()));
+      RetryEINTR(read)(dev_fd_, buf.data(), buf.size()));
 
-  struct iovec iov_out[2];
   struct fuse_out_header out_header = {
       .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_init_out),
       .error = 0,
@@ -103,72 +177,74 @@ PosixError FuseTest::ConsumeFuseInit() {
   };
   // Returns a fake fuse_init_out with 7.0 version to avoid ECONNREFUSED
   // error in the initialization of FUSE connection.
-  struct fuse_init_out out_payload = {
-      .major = 7,
-  };
-  iov_out[0].iov_len = sizeof(out_header);
-  iov_out[0].iov_base = &out_header;
-  iov_out[1].iov_len = sizeof(out_payload);
-  iov_out[1].iov_base = &out_payload;
+  auto iov_out = FuseGenerateIovecs(
+      out_header, *const_cast<struct fuse_init_out*>(out_payload));
 
-  RETURN_ERROR_IF_SYSCALL_FAIL(RetryEINTR(writev)(dev_fd_, iov_out, 2));
+  RETURN_ERROR_IF_SYSCALL_FAIL(
+      RetryEINTR(writev)(dev_fd_, iov_out.data(), iov_out.size()));
   return NoError();
 }
 
-// ReceiveExpected reads 1 pair of expected fuse request-response `iovec`s
-// from pipe and save them into member variables of this testing instance.
-void FuseTest::ReceiveExpected() {
-  // Set expected fuse_in request.
-  EXPECT_THAT(len_in_ = RetryEINTR(read)(set_expected_[0], mem_in_.data(),
-                                         mem_in_.size()),
-              SyscallSucceedsWithValue(::testing::Gt(0)));
-  MarkDone(len_in_ > 0);
+// Reads 1 expected opcode and a fake response from socket and save them into
+// the serial buffer of this testing instance.
+void FuseTest::ServerReceiveResponse() {
+  ssize_t len;
+  uint32_t opcode;
+  std::vector<char> buf(FUSE_MIN_READ_BUFFER);
+  EXPECT_THAT(RetryEINTR(read)(sock_[1], &opcode, sizeof(opcode)),
+              SyscallSucceedsWithValue(sizeof(opcode)));
 
-  // Set expected fuse_out response.
-  EXPECT_THAT(len_out_ = RetryEINTR(read)(set_expected_[0], mem_out_.data(),
-                                          mem_out_.size()),
-              SyscallSucceedsWithValue(::testing::Gt(0)));
-  MarkDone(len_out_ > 0);
+  EXPECT_THAT(len = RetryEINTR(read)(sock_[1], buf.data(), buf.size()),
+              SyscallSucceeds());
+
+  responses_.AddMemBlock(opcode, buf.data(), len);
 }
 
-// MarkDone writes 1 byte of success indicator through pipe.
-void FuseTest::MarkDone(bool success) {
-  char data = success ? 1 : 0;
-  EXPECT_THAT(RetryEINTR(write)(done_[1], &data, sizeof(data)),
-              SyscallSucceedsWithValue(1));
+// Writes 1 byte of success indicator through socket.
+void FuseTest::ServerCompleteWith(bool success) {
+  uint32_t data = success ? 1 : 0;
+  ServerSendData(data);
 }
 
-// FuseLoop is the implementation of the fake FUSE server. Read from /dev/fuse,
-// compare the request by CompareRequest (use derived function if specified),
-// and write the expected response to /dev/fuse.
-void FuseTest::FuseLoop() {
-  bool success = true;
-  ssize_t len = 0;
+// ServerFuseLoop is the implementation of the fake FUSE server. Monitors 2
+// file descriptors: /dev/fuse and sock_[1]. Events from /dev/fuse are FUSE
+// requests and events from sock_[1] are FUSE testing commands, leading by
+// a FuseTestCmd data to indicate the command.
+void FuseTest::ServerFuseLoop() {
+  const int nfds = 2;
+  struct pollfd fds[nfds] = {
+      {
+          .fd = dev_fd_,
+          .events = POLL_IN | POLLHUP | POLLERR | POLLNVAL,
+      },
+      {
+          .fd = sock_[1],
+          .events = POLL_IN | POLLHUP | POLLERR | POLLNVAL,
+      },
+  };
+
   while (true) {
-    ReceiveExpected();
+    ASSERT_THAT(poll(fds, nfds, -1), SyscallSucceeds());
 
-    EXPECT_THAT(len = RetryEINTR(read)(dev_fd_, buf_.data(), buf_.size()),
-                SyscallSucceedsWithValue(len_in_));
-    if (len != len_in_) success = false;
+    for (int fd_idx = 0; fd_idx < nfds; ++fd_idx) {
+      if (fds[fd_idx].revents == 0) continue;
 
-    if (!CompareRequest(buf_.data(), len_in_, mem_in_.data(), len_in_)) {
-      std::cerr << "the FUSE request is not expected" << std::endl;
-      success = false;
+      ASSERT_EQ(fds[fd_idx].revents, POLL_IN);
+      if (fds[fd_idx].fd == sock_[1]) {
+        ServerHandleCommand();
+      } else if (fds[fd_idx].fd == dev_fd_) {
+        ServerProcessFuseRequest();
+      }
     }
-
-    EXPECT_THAT(len = RetryEINTR(write)(dev_fd_, mem_out_.data(), len_out_),
-                SyscallSucceedsWithValue(len_out_));
-    if (len != len_out_) success = false;
-    MarkDone(success);
   }
 }
 
-// SetUpFuseServer creates 2 pipes. First is for testing client to send the
-// expected request-response pair, and the other acts as a checkpoint for the
-// FUSE server to notify the client that it can proceed.
-void FuseTest::SetUpFuseServer() {
-  ASSERT_THAT(pipe(set_expected_), SyscallSucceedsWithValue(0));
-  ASSERT_THAT(pipe(done_), SyscallSucceedsWithValue(0));
+// SetUpFuseServer creates 1 socketpair and fork the process. The parent thread
+// becomes testing thread and the child thread becomes the FUSE server running
+// in background. These 2 threads are connected via socketpair. sock_[0] is
+// opened in testing thread and sock_[1] is opened in the FUSE server.
+void FuseTest::SetUpFuseServer(const struct fuse_init_out* payload) {
+  ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_), SyscallSucceeds());
 
   switch (fork()) {
     case -1:
@@ -177,31 +253,194 @@ void FuseTest::SetUpFuseServer() {
     case 0:
       break;
     default:
-      ASSERT_THAT(close(set_expected_[0]), SyscallSucceedsWithValue(0));
-      ASSERT_THAT(close(done_[1]), SyscallSucceedsWithValue(0));
-      WaitCompleted();
+      ASSERT_THAT(close(sock_[1]), SyscallSucceeds());
+      WaitServerComplete();
       return;
   }
 
-  ASSERT_THAT(close(set_expected_[1]), SyscallSucceedsWithValue(0));
-  ASSERT_THAT(close(done_[0]), SyscallSucceedsWithValue(0));
-
-  MarkDone(ConsumeFuseInit().ok());
-
-  FuseLoop();
+  // Begin child thread, i.e. the FUSE server.
+  ASSERT_THAT(close(sock_[0]), SyscallSucceeds());
+  ServerCompleteWith(ServerConsumeFuseInit(payload).ok());
+  ServerFuseLoop();
   _exit(0);
 }
 
-// GetPayloadSize is a helper function to get the number of bytes of a
-// specific FUSE operation struct.
-size_t FuseTest::GetPayloadSize(uint32_t opcode, bool in) {
-  switch (opcode) {
-    case FUSE_INIT:
-      return in ? sizeof(struct fuse_init_in) : sizeof(struct fuse_init_out);
+void FuseTest::ServerSendData(uint32_t data) {
+  EXPECT_THAT(RetryEINTR(write)(sock_[1], &data, sizeof(data)),
+              SyscallSucceedsWithValue(sizeof(data)));
+}
+
+// Reads FuseTestCmd sent from testing thread and routes to correct handler.
+// Since each command should be a blocking operation, a `ServerCompleteWith()`
+// is required after the switch keyword.
+void FuseTest::ServerHandleCommand() {
+  uint32_t cmd;
+  EXPECT_THAT(RetryEINTR(read)(sock_[1], &cmd, sizeof(cmd)),
+              SyscallSucceedsWithValue(sizeof(cmd)));
+
+  switch (static_cast<FuseTestCmd>(cmd)) {
+    case FuseTestCmd::kSetResponse:
+      ServerReceiveResponse();
+      break;
+    case FuseTestCmd::kSetInodeLookup:
+      ServerReceiveInodeLookup();
+      break;
+    case FuseTestCmd::kGetRequest:
+      ServerSendReceivedRequest();
+      break;
+    case FuseTestCmd::kGetTotalReceivedBytes:
+      ServerSendData(static_cast<uint32_t>(requests_.UsedBytes()));
+      break;
+    case FuseTestCmd::kGetNumUnconsumedRequests:
+      ServerSendData(static_cast<uint32_t>(requests_.RemainingBlocks()));
+      break;
+    case FuseTestCmd::kGetNumUnsentResponses:
+      ServerSendData(static_cast<uint32_t>(responses_.RemainingBlocks()));
+      break;
+    case FuseTestCmd::kSkipRequest:
+      ServerSkipReceivedRequest();
+      break;
     default:
+      FAIL() << "Unknown FuseTestCmd " << cmd;
       break;
   }
-  return 0;
+
+  ServerCompleteWith(!HasFailure());
+}
+
+// Reads the expected file mode and the path of one file. Crafts a basic
+// `fuse_entry_out` memory block and inserts into a map for future use.
+// The FUSE server will always return this response if a FUSE_LOOKUP
+// request with this specific path comes in.
+void FuseTest::ServerReceiveInodeLookup() {
+  mode_t mode;
+  uint64_t size;
+  std::vector<char> buf(FUSE_MIN_READ_BUFFER);
+
+  EXPECT_THAT(RetryEINTR(read)(sock_[1], &mode, sizeof(mode)),
+              SyscallSucceedsWithValue(sizeof(mode)));
+
+  EXPECT_THAT(RetryEINTR(read)(sock_[1], &size, sizeof(size)),
+              SyscallSucceedsWithValue(sizeof(size)));
+
+  EXPECT_THAT(RetryEINTR(read)(sock_[1], buf.data(), buf.size()),
+              SyscallSucceeds());
+
+  std::string path(buf.data());
+
+  uint32_t out_len =
+      sizeof(struct fuse_out_header) + sizeof(struct fuse_entry_out);
+  struct fuse_out_header out_header = {
+      .len = out_len,
+      .error = 0,
+  };
+  struct fuse_entry_out out_payload = DefaultEntryOut(mode, nodeid_);
+  // Since this is only used in test, nodeid_ is simply increased by 1 to
+  // comply with the unqiueness of different path.
+  ++nodeid_;
+
+  // Set the size.
+  out_payload.attr.size = size;
+
+  memcpy(buf.data(), &out_header, sizeof(out_header));
+  memcpy(buf.data() + sizeof(out_header), &out_payload, sizeof(out_payload));
+  lookups_.AddMemBlock(FUSE_LOOKUP, buf.data(), out_len);
+  lookup_map_[path] = lookups_.Next();
+}
+
+// Sends the received request pointed by current cursor and advances cursor.
+void FuseTest::ServerSendReceivedRequest() {
+  if (requests_.End()) {
+    FAIL() << "No more received request.";
+    return;
+  }
+  auto mem_block = requests_.Next();
+  EXPECT_THAT(
+      RetryEINTR(write)(sock_[1], requests_.DataAtOffset(mem_block.offset),
+                        mem_block.len),
+      SyscallSucceedsWithValue(mem_block.len));
+}
+
+// Skip the request pointed by current cursor.
+void FuseTest::ServerSkipReceivedRequest() {
+  if (requests_.End()) {
+    FAIL() << "No more received request.";
+    return;
+  }
+  requests_.Next();
+}
+
+// Handles FUSE request. Reads request from /dev/fuse, checks if it has the
+// same opcode as expected, and responds with the saved fake FUSE response.
+// The FUSE request is copied to the serial buffer and can be retrieved one-
+// by-one by calling GetServerActualRequest from testing thread.
+void FuseTest::ServerProcessFuseRequest() {
+  ssize_t len;
+  std::vector<char> buf(FUSE_MIN_READ_BUFFER);
+
+  // Read FUSE request.
+  EXPECT_THAT(len = RetryEINTR(read)(dev_fd_, buf.data(), buf.size()),
+              SyscallSucceeds());
+  fuse_in_header* in_header = reinterpret_cast<fuse_in_header*>(buf.data());
+
+  // Check if this is a preset FUSE_LOOKUP path.
+  if (in_header->opcode == FUSE_LOOKUP) {
+    std::string path(buf.data() + sizeof(struct fuse_in_header));
+    auto it = lookup_map_.find(path);
+    if (it != lookup_map_.end()) {
+      // Matches a preset path. Reply with fake data and skip saving the
+      // request.
+      ServerRespondFuseSuccess(lookups_, it->second, in_header->unique);
+      return;
+    }
+  }
+
+  requests_.AddMemBlock(in_header->opcode, buf.data(), len);
+
+  if (in_header->opcode == FUSE_RELEASE || in_header->opcode == FUSE_RELEASEDIR)
+    return;
+  // Check if there is a corresponding response.
+  if (responses_.End()) {
+    GTEST_NONFATAL_FAILURE_("No more FUSE response is expected");
+    ServerRespondFuseError(in_header->unique);
+    return;
+  }
+  auto mem_block = responses_.Next();
+  if (in_header->opcode != mem_block.opcode) {
+    std::string message = absl::StrFormat("Expect opcode %d but got %d",
+                                          mem_block.opcode, in_header->opcode);
+    GTEST_NONFATAL_FAILURE_(message.c_str());
+    // We won't get correct response if opcode is not expected. Send error
+    // response here to avoid wrong parsing by VFS.
+    ServerRespondFuseError(in_header->unique);
+    return;
+  }
+
+  // Write FUSE response.
+  ServerRespondFuseSuccess(responses_, mem_block, in_header->unique);
+}
+
+void FuseTest::ServerRespondFuseSuccess(FuseMemBuffer& mem_buf,
+                                        const FuseMemBlock& block,
+                                        uint64_t unique) {
+  fuse_out_header* out_header =
+      reinterpret_cast<fuse_out_header*>(mem_buf.DataAtOffset(block.offset));
+
+  // Patch `unique` in fuse_out_header to avoid EINVAL caused by responding
+  // with an unknown `unique`.
+  out_header->unique = unique;
+  EXPECT_THAT(RetryEINTR(write)(dev_fd_, out_header, block.len),
+              SyscallSucceedsWithValue(block.len));
+}
+
+void FuseTest::ServerRespondFuseError(uint64_t unique) {
+  fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header),
+      .error = ENOSYS,
+      .unique = unique,
+  };
+  EXPECT_THAT(RetryEINTR(write)(dev_fd_, &out_header, sizeof(out_header)),
+              SyscallSucceedsWithValue(sizeof(out_header)));
 }
 
 }  // namespace testing
diff --git a/test/fuse/linux/fuse_base.h b/test/fuse/linux/fuse_base.h
index 3a2f255a9..6ad296ca2 100644
--- a/test/fuse/linux/fuse_base.h
+++ b/test/fuse/linux/fuse_base.h
@@ -16,8 +16,12 @@
 #define GVISOR_TEST_FUSE_FUSE_BASE_H_
 
 #include <linux/fuse.h>
+#include <string.h>
+#include <sys/stat.h>
 #include <sys/uio.h>
 
+#include <iostream>
+#include <unordered_map>
 #include <vector>
 
 #include "gtest/gtest.h"
@@ -29,68 +33,216 @@ namespace testing {
 
 constexpr char kMountOpts[] = "rootmode=755,user_id=0,group_id=0";
 
-class FuseTest : public ::testing::Test {
+constexpr struct fuse_init_out kDefaultFUSEInitOutPayload = {.major = 7};
+
+// Internal commands used to communicate between testing thread and the FUSE
+// server. See test/fuse/README.md for further detail.
+enum class FuseTestCmd {
+  kSetResponse = 0,
+  kSetInodeLookup,
+  kGetRequest,
+  kGetNumUnconsumedRequests,
+  kGetNumUnsentResponses,
+  kGetTotalReceivedBytes,
+  kSkipRequest,
+};
+
+// Holds the information of a memory block in a serial buffer.
+struct FuseMemBlock {
+  uint32_t opcode;
+  size_t offset;
+  size_t len;
+};
+
+// A wrapper of a simple serial buffer that can be used with read(2) and
+// write(2). Contains a cursor to indicate accessing. This class is not thread-
+// safe and can only be used in single-thread version.
+class FuseMemBuffer {
  public:
-  FuseTest() {
-    buf_.resize(FUSE_MIN_READ_BUFFER);
-    mem_in_.resize(FUSE_MIN_READ_BUFFER);
-    mem_out_.resize(FUSE_MIN_READ_BUFFER);
+  FuseMemBuffer() : cursor_(0) {
+    // To read from /dev/fuse, a buffer needs at least FUSE_MIN_READ_BUFFER
+    // bytes to avoid EINVAL. FuseMemBuffer holds memory that can accommodate
+    // a sequence of FUSE request/response, so it is initiated with double
+    // minimal requirement.
+    mem_.resize(FUSE_MIN_READ_BUFFER * 2);
   }
-  void SetUp() override;
-  void TearDown() override;
 
-  // CompareRequest is used by the FUSE server and should be implemented to
-  // compare different FUSE operations. It compares the actual FUSE input
-  // request with the expected one set by `SetExpected()`.
-  virtual bool CompareRequest(void* expected_mem, size_t expected_len,
-                              void* real_mem, size_t real_len);
+  // Returns whether there is no memory block.
+  bool Empty() { return blocks_.empty(); }
+
+  // Returns if there is no more remaining memory blocks.
+  bool End() { return cursor_ == blocks_.size(); }
+
+  // Returns how many bytes that have been received.
+  size_t UsedBytes() {
+    return Empty() ? 0 : blocks_.back().offset + blocks_.back().len;
+  }
+
+  // Returns the available bytes remains in the serial buffer.
+  size_t AvailBytes() { return mem_.size() - UsedBytes(); }
+
+  // Appends a memory block information that starts at the tail of the serial
+  // buffer. /dev/fuse requires at least FUSE_MIN_READ_BUFFER bytes to read, or
+  // it will issue EINVAL. If it is not enough, just double the buffer length.
+  void AddMemBlock(uint32_t opcode, void* data, size_t len) {
+    if (AvailBytes() < FUSE_MIN_READ_BUFFER) {
+      mem_.resize(mem_.size() << 1);
+    }
+    size_t offset = UsedBytes();
+    memcpy(mem_.data() + offset, data, len);
+    blocks_.push_back(FuseMemBlock{opcode, offset, len});
+  }
+
+  // Returns the memory address at a specific offset. Used with read(2) or
+  // write(2).
+  char* DataAtOffset(size_t offset) { return mem_.data() + offset; }
+
+  // Returns current memory block pointed by the cursor and increase by 1.
+  FuseMemBlock Next() {
+    if (End()) {
+      std::cerr << "Buffer is already exhausted." << std::endl;
+      return FuseMemBlock{};
+    }
+    return blocks_[cursor_++];
+  }
+
+  // Returns the number of the blocks that has not been requested.
+  size_t RemainingBlocks() { return blocks_.size() - cursor_; }
+
+ private:
+  size_t cursor_;
+  std::vector<FuseMemBlock> blocks_;
+  std::vector<char> mem_;
+};
 
-  // SetExpected is called by the testing main thread. Writes a request-
-  // response pair into FUSE server's member variables via pipe.
-  void SetExpected(struct iovec* iov_in, int iov_in_cnt, struct iovec* iov_out,
-                   int iov_out_cnt);
+// FuseTest base class is useful in FUSE integration test. Inherit this class
+// to automatically set up a fake FUSE server and use the member functions
+// to manipulate with it. Refer to test/fuse/README.md for detailed explanation.
+class FuseTest : public ::testing::Test {
+ public:
+  // nodeid_ is the ID of a fake inode. We starts from 2 since 1 is occupied by
+  // the mount point.
+  FuseTest() : nodeid_(2) {}
+  void SetUp() override;
+  void TearDown() override;
 
-  // WaitCompleted waits for FUSE server to complete its processing. It
-  // complains if the FUSE server responds failure during tests.
-  void WaitCompleted();
+  // Called by the testing thread to set up a fake response for an expected
+  // opcode via socket. This can be used multiple times to define a sequence of
+  // expected FUSE reactions.
+  void SetServerResponse(uint32_t opcode, std::vector<struct iovec>& iovecs);
+
+  // Called by the testing thread to install a fake path under the mount point.
+  // e.g. a file under /mnt/dir/file and moint point is /mnt, then it will look
+  // up "dir/file" in this case.
+  //
+  // It sets a fixed response to the FUSE_LOOKUP requests issued with this
+  // path, pretending there is an inode and avoid ENOENT when testing. If mode
+  // is not given, it creates a regular file with mode 0600.
+  void SetServerInodeLookup(const std::string& path,
+                            mode_t mode = S_IFREG | S_IRUSR | S_IWUSR,
+                            uint64_t size = 512);
+
+  // Called by the testing thread to ask the FUSE server for its next received
+  // FUSE request. Be sure to use the corresponding struct of iovec to receive
+  // data from server.
+  void GetServerActualRequest(std::vector<struct iovec>& iovecs);
+
+  // Called by the testing thread to query the number of unconsumed requests in
+  // the requests_ serial buffer of the FUSE server. TearDown() ensures all
+  // FUSE requests received by the FUSE server were consumed by the testing
+  // thread.
+  uint32_t GetServerNumUnconsumedRequests();
+
+  // Called by the testing thread to query the number of unsent responses in
+  // the responses_ serial buffer of the FUSE server. TearDown() ensures all
+  // preset FUSE responses were sent out by the FUSE server.
+  uint32_t GetServerNumUnsentResponses();
+
+  // Called by the testing thread to ask the FUSE server for its total received
+  // bytes from /dev/fuse.
+  uint32_t GetServerTotalReceivedBytes();
+
+  // Called by the testing thread to ask the FUSE server to skip stored
+  // request data.
+  void SkipServerActualRequest();
 
  protected:
   TempPath mount_point_;
 
- private:
-  void MountFuse();
+  // Opens /dev/fuse and inherit the file descriptor for the FUSE server.
+  void MountFuse(const char* mountOpts = kMountOpts);
+
+  // Creates a socketpair for communication and forks FUSE server.
+  void SetUpFuseServer(
+      const struct fuse_init_out* payload = &kDefaultFUSEInitOutPayload);
+
+  // Unmounts the mountpoint of the FUSE server.
   void UnmountFuse();
 
-  // ConsumeFuseInit is only used during FUSE server setup.
-  PosixError ConsumeFuseInit();
+ private:
+  // Sends a FuseTestCmd and gets a uint32_t data from the FUSE server.
+  inline uint32_t GetServerData(uint32_t cmd);
+
+  // Waits for FUSE server to complete its processing. Complains if the FUSE
+  // server responds any failure during tests.
+  void WaitServerComplete();
 
-  // ReceiveExpected is the FUSE server side's corresponding code of
-  // `SetExpected()`. Save the request-response pair into its memory.
-  void ReceiveExpected();
+  // The FUSE server stays here and waits next command or FUSE request until it
+  // is terminated.
+  void ServerFuseLoop();
 
-  // MarkDone is used by the FUSE server to tell testing main if it's OK to
-  // proceed next command.
-  void MarkDone(bool success);
+  // Used by the FUSE server to tell testing thread if it is OK to proceed next
+  // command. Will be issued after processing each FuseTestCmd.
+  void ServerCompleteWith(bool success);
 
-  // FuseLoop is where the FUSE server stay until it is terminated.
-  void FuseLoop();
+  // Consumes the first FUSE request when mounting FUSE. Replies with a
+  // response with empty payload.
+  PosixError ServerConsumeFuseInit(const struct fuse_init_out* payload);
 
-  // SetUpFuseServer creates 2 pipes for communication and forks FUSE server.
-  void SetUpFuseServer();
+  // A command switch that dispatch different FuseTestCmd to its handler.
+  void ServerHandleCommand();
 
-  // GetPayloadSize is a helper function to get the number of bytes of a
-  // specific FUSE operation struct.
-  size_t GetPayloadSize(uint32_t opcode, bool in);
+  // The FUSE server side's corresponding code of `SetServerResponse()`.
+  // Handles `kSetResponse` command. Saves the fake response into its output
+  // memory queue.
+  void ServerReceiveResponse();
+
+  // The FUSE server side's corresponding code of `SetServerInodeLookup()`.
+  // Handles `kSetInodeLookup` command. Receives an expected file mode and
+  // file path under the mount point.
+  void ServerReceiveInodeLookup();
+
+  // The FUSE server side's corresponding code of `GetServerActualRequest()`.
+  // Handles `kGetRequest` command. Sends the next received request pointed by
+  // the cursor.
+  void ServerSendReceivedRequest();
+
+  // Sends a uint32_t data via socket.
+  inline void ServerSendData(uint32_t data);
+
+  // The FUSE server side's corresponding code of `SkipServerActualRequest()`.
+  // Handles `kSkipRequest` command. Skip the request pointed by current cursor.
+  void ServerSkipReceivedRequest();
+
+  // Handles FUSE request sent to /dev/fuse by its saved responses.
+  void ServerProcessFuseRequest();
+
+  // Responds to FUSE request with a saved data.
+  void ServerRespondFuseSuccess(FuseMemBuffer& mem_buf,
+                                const FuseMemBlock& block, uint64_t unique);
+
+  // Responds an error header to /dev/fuse when bad thing happens.
+  void ServerRespondFuseError(uint64_t unique);
 
   int dev_fd_;
-  int set_expected_[2];
-  int done_[2];
-
-  std::vector<char> buf_;
-  std::vector<char> mem_in_;
-  std::vector<char> mem_out_;
-  ssize_t len_in_;
-  ssize_t len_out_;
+  int sock_[2];
+
+  uint64_t nodeid_;
+  std::unordered_map<std::string, FuseMemBlock> lookup_map_;
+
+  FuseMemBuffer requests_;
+  FuseMemBuffer responses_;
+  FuseMemBuffer lookups_;
 };
 
 }  // namespace testing
diff --git a/test/fuse/linux/fuse_fd_util.cc b/test/fuse/linux/fuse_fd_util.cc
new file mode 100644
index 000000000..30d1157bb
--- /dev/null
+++ b/test/fuse/linux/fuse_fd_util.cc
@@ -0,0 +1,61 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/fuse/linux/fuse_fd_util.h"
+
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+
+#include <string>
+#include <vector>
+
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fuse_util.h"
+#include "test/util/posix_error.h"
+
+namespace gvisor {
+namespace testing {
+
+PosixErrorOr<FileDescriptor> FuseFdTest::OpenPath(const std::string &path,
+                                                  uint32_t flags, uint64_t fh) {
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_open_out),
+  };
+  struct fuse_open_out out_payload = {
+      .fh = fh,
+      .open_flags = flags,
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_OPEN, iov_out);
+
+  auto res = Open(path.c_str(), flags);
+  if (res.ok()) {
+    SkipServerActualRequest();
+  }
+  return res;
+}
+
+Cleanup FuseFdTest::CloseFD(FileDescriptor &fd) {
+  return Cleanup([&] {
+    close(fd.release());
+    SkipServerActualRequest();
+  });
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/fuse/linux/fuse_fd_util.h b/test/fuse/linux/fuse_fd_util.h
new file mode 100644
index 000000000..066185c94
--- /dev/null
+++ b/test/fuse/linux/fuse_fd_util.h
@@ -0,0 +1,48 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_FUSE_FUSE_FD_UTIL_H_
+#define GVISOR_TEST_FUSE_FUSE_FD_UTIL_H_
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <string>
+
+#include "test/fuse/linux/fuse_base.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+
+namespace gvisor {
+namespace testing {
+
+class FuseFdTest : public FuseTest {
+ public:
+  // Sets the FUSE server to respond to a FUSE_OPEN with corresponding flags and
+  // fh. Then does a real file system open on the absolute path to get an fd.
+  PosixErrorOr<FileDescriptor> OpenPath(const std::string &path,
+                                        uint32_t flags = O_RDONLY,
+                                        uint64_t fh = 1);
+
+  // Returns a cleanup object that closes the fd when it is destroyed. After
+  // the close is done, tells the FUSE server to skip this FUSE_RELEASE.
+  Cleanup CloseFD(FileDescriptor &fd);
+};
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_FUSE_FUSE_FD_UTIL_H_
diff --git a/test/fuse/linux/mkdir_test.cc b/test/fuse/linux/mkdir_test.cc
new file mode 100644
index 000000000..9647cb93f
--- /dev/null
+++ b/test/fuse/linux/mkdir_test.cc
@@ -0,0 +1,88 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/fuse/linux/fuse_base.h"
+#include "test/util/fuse_util.h"
+#include "test/util/temp_umask.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class MkdirTest : public FuseTest {
+ protected:
+  const std::string test_dir_ = "test_dir";
+  const mode_t perms_ = S_IRWXU | S_IRWXG | S_IRWXO;
+};
+
+TEST_F(MkdirTest, CreateDir) {
+  const std::string test_dir_path_ =
+      JoinPath(mount_point_.path().c_str(), test_dir_);
+  const mode_t new_umask = 0077;
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_entry_out),
+  };
+  struct fuse_entry_out out_payload = DefaultEntryOut(S_IFDIR | perms_, 5);
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_MKDIR, iov_out);
+  TempUmask mask(new_umask);
+  ASSERT_THAT(mkdir(test_dir_path_.c_str(), 0777), SyscallSucceeds());
+
+  struct fuse_in_header in_header;
+  struct fuse_mkdir_in in_payload;
+  std::vector<char> actual_dir(test_dir_.length() + 1);
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload, actual_dir);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_header.len,
+            sizeof(in_header) + sizeof(in_payload) + test_dir_.length() + 1);
+  EXPECT_EQ(in_header.opcode, FUSE_MKDIR);
+  EXPECT_EQ(in_payload.mode & 0777, perms_ & ~new_umask);
+  EXPECT_EQ(in_payload.umask, new_umask);
+  EXPECT_EQ(std::string(actual_dir.data()), test_dir_);
+}
+
+TEST_F(MkdirTest, FileTypeError) {
+  const std::string test_dir_path_ =
+      JoinPath(mount_point_.path().c_str(), test_dir_);
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_entry_out),
+  };
+  struct fuse_entry_out out_payload = DefaultEntryOut(S_IFREG | perms_, 5);
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_MKDIR, iov_out);
+  ASSERT_THAT(mkdir(test_dir_path_.c_str(), 0777), SyscallFailsWithErrno(EIO));
+  SkipServerActualRequest();
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/fuse/linux/mknod_test.cc b/test/fuse/linux/mknod_test.cc
new file mode 100644
index 000000000..74c74d76b
--- /dev/null
+++ b/test/fuse/linux/mknod_test.cc
@@ -0,0 +1,107 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/fuse/linux/fuse_base.h"
+#include "test/util/fuse_util.h"
+#include "test/util/temp_umask.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class MknodTest : public FuseTest {
+ protected:
+  const std::string test_file_ = "test_file";
+  const mode_t perms_ = S_IRWXU | S_IRWXG | S_IRWXO;
+};
+
+TEST_F(MknodTest, RegularFile) {
+  const std::string test_file_path =
+      JoinPath(mount_point_.path().c_str(), test_file_);
+  const mode_t new_umask = 0077;
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_entry_out),
+  };
+  struct fuse_entry_out out_payload = DefaultEntryOut(S_IFREG | perms_, 5);
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_MKNOD, iov_out);
+  TempUmask mask(new_umask);
+  ASSERT_THAT(mknod(test_file_path.c_str(), perms_, 0), SyscallSucceeds());
+
+  struct fuse_in_header in_header;
+  struct fuse_mknod_in in_payload;
+  std::vector<char> actual_file(test_file_.length() + 1);
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload, actual_file);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_header.len,
+            sizeof(in_header) + sizeof(in_payload) + test_file_.length() + 1);
+  EXPECT_EQ(in_header.opcode, FUSE_MKNOD);
+  EXPECT_EQ(in_payload.mode & 0777, perms_ & ~new_umask);
+  EXPECT_EQ(in_payload.umask, new_umask);
+  EXPECT_EQ(in_payload.rdev, 0);
+  EXPECT_EQ(std::string(actual_file.data()), test_file_);
+}
+
+TEST_F(MknodTest, FileTypeError) {
+  const std::string test_file_path =
+      JoinPath(mount_point_.path().c_str(), test_file_);
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_entry_out),
+  };
+  // server return directory instead of regular file should cause an error.
+  struct fuse_entry_out out_payload = DefaultEntryOut(S_IFDIR | perms_, 5);
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_MKNOD, iov_out);
+  ASSERT_THAT(mknod(test_file_path.c_str(), perms_, 0),
+              SyscallFailsWithErrno(EIO));
+  SkipServerActualRequest();
+}
+
+TEST_F(MknodTest, NodeIDError) {
+  const std::string test_file_path =
+      JoinPath(mount_point_.path().c_str(), test_file_);
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_entry_out),
+  };
+  struct fuse_entry_out out_payload =
+      DefaultEntryOut(S_IFREG | perms_, FUSE_ROOT_ID);
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_MKNOD, iov_out);
+  ASSERT_THAT(mknod(test_file_path.c_str(), perms_, 0),
+              SyscallFailsWithErrno(EIO));
+  SkipServerActualRequest();
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/fuse/linux/open_test.cc b/test/fuse/linux/open_test.cc
new file mode 100644
index 000000000..4b0c4a805
--- /dev/null
+++ b/test/fuse/linux/open_test.cc
@@ -0,0 +1,128 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "test/fuse/linux/fuse_base.h"
+#include "test/util/fuse_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class OpenTest : public FuseTest {
+  // OpenTest doesn't care the release request when close a fd,
+  // so doesn't check leftover requests when tearing down.
+  void TearDown() { UnmountFuse(); }
+
+ protected:
+  const std::string test_file_ = "test_file";
+  const mode_t regular_file_ = S_IFREG | S_IRWXU | S_IRWXG | S_IRWXO;
+
+  struct fuse_open_out out_payload_ = {
+      .fh = 1,
+      .open_flags = O_RDWR,
+  };
+};
+
+TEST_F(OpenTest, RegularFile) {
+  const std::string test_file_path =
+      JoinPath(mount_point_.path().c_str(), test_file_);
+  SetServerInodeLookup(test_file_, regular_file_);
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_open_out),
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload_);
+  SetServerResponse(FUSE_OPEN, iov_out);
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_path.c_str(), O_RDWR));
+
+  struct fuse_in_header in_header;
+  struct fuse_open_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload));
+  EXPECT_EQ(in_header.opcode, FUSE_OPEN);
+  EXPECT_EQ(in_payload.flags, O_RDWR);
+  EXPECT_THAT(fcntl(fd.get(), F_GETFL), SyscallSucceedsWithValue(O_RDWR));
+}
+
+TEST_F(OpenTest, SetNoOpen) {
+  const std::string test_file_path =
+      JoinPath(mount_point_.path().c_str(), test_file_);
+  SetServerInodeLookup(test_file_, regular_file_);
+
+  // ENOSYS indicates open is not implemented.
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_open_out),
+      .error = -ENOSYS,
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload_);
+  SetServerResponse(FUSE_OPEN, iov_out);
+  ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_path.c_str(), O_RDWR));
+  SkipServerActualRequest();
+
+  // check open doesn't send new request.
+  uint32_t recieved_before = GetServerTotalReceivedBytes();
+  ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_path.c_str(), O_RDWR));
+  EXPECT_EQ(GetServerTotalReceivedBytes(), recieved_before);
+}
+
+TEST_F(OpenTest, OpenFail) {
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_open_out),
+      .error = -ENOENT,
+  };
+
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload_);
+  SetServerResponse(FUSE_OPENDIR, iov_out);
+  ASSERT_THAT(open(mount_point_.path().c_str(), O_RDWR),
+              SyscallFailsWithErrno(ENOENT));
+
+  struct fuse_in_header in_header;
+  struct fuse_open_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload));
+  EXPECT_EQ(in_header.opcode, FUSE_OPENDIR);
+  EXPECT_EQ(in_payload.flags, O_RDWR);
+}
+
+TEST_F(OpenTest, DirectoryFlagOnRegularFile) {
+  const std::string test_file_path =
+      JoinPath(mount_point_.path().c_str(), test_file_);
+
+  SetServerInodeLookup(test_file_, regular_file_);
+  ASSERT_THAT(open(test_file_path.c_str(), O_RDWR | O_DIRECTORY),
+              SyscallFailsWithErrno(ENOTDIR));
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/fuse/linux/read_test.cc b/test/fuse/linux/read_test.cc
new file mode 100644
index 000000000..88fc299d8
--- /dev/null
+++ b/test/fuse/linux/read_test.cc
@@ -0,0 +1,390 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/fuse/linux/fuse_base.h"
+#include "test/util/fuse_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class ReadTest : public FuseTest {
+  void SetUp() override {
+    FuseTest::SetUp();
+    test_file_path_ = JoinPath(mount_point_.path().c_str(), test_file_);
+  }
+
+  // TearDown overrides the parent's function
+  // to skip checking the unconsumed release request at the end.
+  void TearDown() override { UnmountFuse(); }
+
+ protected:
+  const std::string test_file_ = "test_file";
+  const mode_t test_file_mode_ = S_IFREG | S_IRWXU | S_IRWXG | S_IRWXO;
+  const uint64_t test_fh_ = 1;
+  const uint32_t open_flag_ = O_RDWR;
+
+  std::string test_file_path_;
+
+  PosixErrorOr<FileDescriptor> OpenTestFile(const std::string &path,
+                                            uint64_t size = 512) {
+    SetServerInodeLookup(test_file_, test_file_mode_, size);
+
+    struct fuse_out_header out_header_open = {
+        .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_open_out),
+    };
+    struct fuse_open_out out_payload_open = {
+        .fh = test_fh_,
+        .open_flags = open_flag_,
+    };
+    auto iov_out_open = FuseGenerateIovecs(out_header_open, out_payload_open);
+    SetServerResponse(FUSE_OPEN, iov_out_open);
+
+    auto res = Open(path.c_str(), open_flag_);
+    if (res.ok()) {
+      SkipServerActualRequest();
+    }
+    return res;
+  }
+};
+
+class ReadTestSmallMaxRead : public ReadTest {
+  void SetUp() override {
+    MountFuse(mountOpts);
+    SetUpFuseServer();
+    test_file_path_ = JoinPath(mount_point_.path().c_str(), test_file_);
+  }
+
+ protected:
+  constexpr static char mountOpts[] =
+      "rootmode=755,user_id=0,group_id=0,max_read=4096";
+  // 4096 is hard-coded as the max_read in mount options.
+  const int size_fragment = 4096;
+};
+
+TEST_F(ReadTest, ReadWhole) {
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_));
+
+  // Prepare for the read.
+  const int n_read = 5;
+  std::vector<char> data(n_read);
+  RandomizeBuffer(data.data(), data.size());
+  struct fuse_out_header out_header_read = {
+      .len =
+          static_cast<uint32_t>(sizeof(struct fuse_out_header) + data.size()),
+  };
+  auto iov_out_read = FuseGenerateIovecs(out_header_read, data);
+  SetServerResponse(FUSE_READ, iov_out_read);
+
+  // Read the whole "file".
+  std::vector<char> buf(n_read);
+  EXPECT_THAT(read(fd.get(), buf.data(), n_read),
+              SyscallSucceedsWithValue(n_read));
+
+  // Check the read request.
+  struct fuse_in_header in_header_read;
+  struct fuse_read_in in_payload_read;
+  auto iov_in = FuseGenerateIovecs(in_header_read, in_payload_read);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_payload_read.fh, test_fh_);
+  EXPECT_EQ(in_header_read.len,
+            sizeof(in_header_read) + sizeof(in_payload_read));
+  EXPECT_EQ(in_header_read.opcode, FUSE_READ);
+  EXPECT_EQ(in_payload_read.offset, 0);
+  EXPECT_EQ(buf, data);
+}
+
+TEST_F(ReadTest, ReadPartial) {
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_));
+
+  // Prepare for the read.
+  const int n_data = 10;
+  std::vector<char> data(n_data);
+  RandomizeBuffer(data.data(), data.size());
+  // Note: due to read ahead, current read implementation will treat any
+  // response that is longer than requested as correct (i.e. not reach the EOF).
+  // Therefore, the test below should make sure the size to read does not exceed
+  // n_data.
+  struct fuse_out_header out_header_read = {
+      .len =
+          static_cast<uint32_t>(sizeof(struct fuse_out_header) + data.size()),
+  };
+  auto iov_out_read = FuseGenerateIovecs(out_header_read, data);
+  struct fuse_in_header in_header_read;
+  struct fuse_read_in in_payload_read;
+  auto iov_in = FuseGenerateIovecs(in_header_read, in_payload_read);
+
+  std::vector<char> buf(n_data);
+
+  // Read 1 bytes.
+  SetServerResponse(FUSE_READ, iov_out_read);
+  EXPECT_THAT(read(fd.get(), buf.data(), 1), SyscallSucceedsWithValue(1));
+
+  // Check the 1-byte read request.
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_payload_read.fh, test_fh_);
+  EXPECT_EQ(in_header_read.len,
+            sizeof(in_header_read) + sizeof(in_payload_read));
+  EXPECT_EQ(in_header_read.opcode, FUSE_READ);
+  EXPECT_EQ(in_payload_read.offset, 0);
+
+  // Read 3 bytes.
+  SetServerResponse(FUSE_READ, iov_out_read);
+  EXPECT_THAT(read(fd.get(), buf.data(), 3), SyscallSucceedsWithValue(3));
+
+  // Check the 3-byte read request.
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_payload_read.fh, test_fh_);
+  EXPECT_EQ(in_payload_read.offset, 1);
+
+  // Read 5 bytes.
+  SetServerResponse(FUSE_READ, iov_out_read);
+  EXPECT_THAT(read(fd.get(), buf.data(), 5), SyscallSucceedsWithValue(5));
+
+  // Check the 5-byte read request.
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_payload_read.fh, test_fh_);
+  EXPECT_EQ(in_payload_read.offset, 4);
+}
+
+TEST_F(ReadTest, PRead) {
+  const int file_size = 512;
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_, file_size));
+
+  // Prepare for the read.
+  const int n_read = 5;
+  std::vector<char> data(n_read);
+  RandomizeBuffer(data.data(), data.size());
+  struct fuse_out_header out_header_read = {
+      .len =
+          static_cast<uint32_t>(sizeof(struct fuse_out_header) + data.size()),
+  };
+  auto iov_out_read = FuseGenerateIovecs(out_header_read, data);
+  SetServerResponse(FUSE_READ, iov_out_read);
+
+  // Read some bytes.
+  std::vector<char> buf(n_read);
+  const int offset_read = file_size >> 1;
+  EXPECT_THAT(pread(fd.get(), buf.data(), n_read, offset_read),
+              SyscallSucceedsWithValue(n_read));
+
+  // Check the read request.
+  struct fuse_in_header in_header_read;
+  struct fuse_read_in in_payload_read;
+  auto iov_in = FuseGenerateIovecs(in_header_read, in_payload_read);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_payload_read.fh, test_fh_);
+  EXPECT_EQ(in_header_read.len,
+            sizeof(in_header_read) + sizeof(in_payload_read));
+  EXPECT_EQ(in_header_read.opcode, FUSE_READ);
+  EXPECT_EQ(in_payload_read.offset, offset_read);
+  EXPECT_EQ(buf, data);
+}
+
+TEST_F(ReadTest, ReadZero) {
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_));
+
+  // Issue the read.
+  std::vector<char> buf;
+  EXPECT_THAT(read(fd.get(), buf.data(), 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(ReadTest, ReadShort) {
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_));
+
+  // Prepare for the short read.
+  const int n_read = 5;
+  std::vector<char> data(n_read >> 1);
+  RandomizeBuffer(data.data(), data.size());
+  struct fuse_out_header out_header_read = {
+      .len =
+          static_cast<uint32_t>(sizeof(struct fuse_out_header) + data.size()),
+  };
+  auto iov_out_read = FuseGenerateIovecs(out_header_read, data);
+  SetServerResponse(FUSE_READ, iov_out_read);
+
+  // Read the whole "file".
+  std::vector<char> buf(n_read);
+  EXPECT_THAT(read(fd.get(), buf.data(), n_read),
+              SyscallSucceedsWithValue(data.size()));
+
+  // Check the read request.
+  struct fuse_in_header in_header_read;
+  struct fuse_read_in in_payload_read;
+  auto iov_in = FuseGenerateIovecs(in_header_read, in_payload_read);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_payload_read.fh, test_fh_);
+  EXPECT_EQ(in_header_read.len,
+            sizeof(in_header_read) + sizeof(in_payload_read));
+  EXPECT_EQ(in_header_read.opcode, FUSE_READ);
+  EXPECT_EQ(in_payload_read.offset, 0);
+  std::vector<char> short_buf(buf.begin(), buf.begin() + data.size());
+  EXPECT_EQ(short_buf, data);
+}
+
+TEST_F(ReadTest, ReadShortEOF) {
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_));
+
+  // Prepare for the short read.
+  struct fuse_out_header out_header_read = {
+      .len = static_cast<uint32_t>(sizeof(struct fuse_out_header)),
+  };
+  auto iov_out_read = FuseGenerateIovecs(out_header_read);
+  SetServerResponse(FUSE_READ, iov_out_read);
+
+  // Read the whole "file".
+  const int n_read = 10;
+  std::vector<char> buf(n_read);
+  EXPECT_THAT(read(fd.get(), buf.data(), n_read), SyscallSucceedsWithValue(0));
+
+  // Check the read request.
+  struct fuse_in_header in_header_read;
+  struct fuse_read_in in_payload_read;
+  auto iov_in = FuseGenerateIovecs(in_header_read, in_payload_read);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_payload_read.fh, test_fh_);
+  EXPECT_EQ(in_header_read.len,
+            sizeof(in_header_read) + sizeof(in_payload_read));
+  EXPECT_EQ(in_header_read.opcode, FUSE_READ);
+  EXPECT_EQ(in_payload_read.offset, 0);
+}
+
+TEST_F(ReadTestSmallMaxRead, ReadSmallMaxRead) {
+  const int n_fragment = 10;
+  const int n_read = size_fragment * n_fragment;
+
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_, n_read));
+
+  // Prepare for the read.
+  std::vector<char> data(size_fragment);
+  RandomizeBuffer(data.data(), data.size());
+  struct fuse_out_header out_header_read = {
+      .len =
+          static_cast<uint32_t>(sizeof(struct fuse_out_header) + data.size()),
+  };
+  auto iov_out_read = FuseGenerateIovecs(out_header_read, data);
+
+  for (int i = 0; i < n_fragment; ++i) {
+    SetServerResponse(FUSE_READ, iov_out_read);
+  }
+
+  // Read the whole "file".
+  std::vector<char> buf(n_read);
+  EXPECT_THAT(read(fd.get(), buf.data(), n_read),
+              SyscallSucceedsWithValue(n_read));
+
+  ASSERT_EQ(GetServerNumUnsentResponses(), 0);
+  ASSERT_EQ(GetServerNumUnconsumedRequests(), n_fragment);
+
+  // Check each read segment.
+  struct fuse_in_header in_header_read;
+  struct fuse_read_in in_payload_read;
+  auto iov_in = FuseGenerateIovecs(in_header_read, in_payload_read);
+
+  for (int i = 0; i < n_fragment; ++i) {
+    GetServerActualRequest(iov_in);
+    EXPECT_EQ(in_payload_read.fh, test_fh_);
+    EXPECT_EQ(in_header_read.len,
+              sizeof(in_header_read) + sizeof(in_payload_read));
+    EXPECT_EQ(in_header_read.opcode, FUSE_READ);
+    EXPECT_EQ(in_payload_read.offset, i * size_fragment);
+    EXPECT_EQ(in_payload_read.size, size_fragment);
+
+    auto it = buf.begin() + i * size_fragment;
+    EXPECT_EQ(std::vector<char>(it, it + size_fragment), data);
+  }
+}
+
+TEST_F(ReadTestSmallMaxRead, ReadSmallMaxReadShort) {
+  const int n_fragment = 10;
+  const int n_read = size_fragment * n_fragment;
+
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_, n_read));
+
+  // Prepare for the read.
+  std::vector<char> data(size_fragment);
+  RandomizeBuffer(data.data(), data.size());
+  struct fuse_out_header out_header_read = {
+      .len =
+          static_cast<uint32_t>(sizeof(struct fuse_out_header) + data.size()),
+  };
+  auto iov_out_read = FuseGenerateIovecs(out_header_read, data);
+
+  for (int i = 0; i < n_fragment - 1; ++i) {
+    SetServerResponse(FUSE_READ, iov_out_read);
+  }
+
+  // The last fragment is a short read.
+  std::vector<char> half_data(data.begin(), data.begin() + (data.size() >> 1));
+  struct fuse_out_header out_header_read_short = {
+      .len = static_cast<uint32_t>(sizeof(struct fuse_out_header) +
+                                   half_data.size()),
+  };
+  auto iov_out_read_short =
+      FuseGenerateIovecs(out_header_read_short, half_data);
+  SetServerResponse(FUSE_READ, iov_out_read_short);
+
+  // Read the whole "file".
+  std::vector<char> buf(n_read);
+  EXPECT_THAT(read(fd.get(), buf.data(), n_read),
+              SyscallSucceedsWithValue(n_read - (data.size() >> 1)));
+
+  ASSERT_EQ(GetServerNumUnsentResponses(), 0);
+  ASSERT_EQ(GetServerNumUnconsumedRequests(), n_fragment);
+
+  // Check each read segment.
+  struct fuse_in_header in_header_read;
+  struct fuse_read_in in_payload_read;
+  auto iov_in = FuseGenerateIovecs(in_header_read, in_payload_read);
+
+  for (int i = 0; i < n_fragment; ++i) {
+    GetServerActualRequest(iov_in);
+    EXPECT_EQ(in_payload_read.fh, test_fh_);
+    EXPECT_EQ(in_header_read.len,
+              sizeof(in_header_read) + sizeof(in_payload_read));
+    EXPECT_EQ(in_header_read.opcode, FUSE_READ);
+    EXPECT_EQ(in_payload_read.offset, i * size_fragment);
+    EXPECT_EQ(in_payload_read.size, size_fragment);
+
+    auto it = buf.begin() + i * size_fragment;
+    if (i != n_fragment - 1) {
+      EXPECT_EQ(std::vector<char>(it, it + data.size()), data);
+    } else {
+      EXPECT_EQ(std::vector<char>(it, it + half_data.size()), half_data);
+    }
+  }
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/fuse/linux/readdir_test.cc b/test/fuse/linux/readdir_test.cc
new file mode 100644
index 000000000..2afb4b062
--- /dev/null
+++ b/test/fuse/linux/readdir_test.cc
@@ -0,0 +1,193 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <linux/unistd.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "test/fuse/linux/fuse_base.h"
+#include "test/util/fuse_util.h"
+#include "test/util/test_util.h"
+
+#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
+#define FUSE_DIRENT_ALIGN(x) \
+  (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))
+#define FUSE_DIRENT_SIZE(d) FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class ReaddirTest : public FuseTest {
+ public:
+  void fill_fuse_dirent(char *buf, const char *name, uint64_t ino) {
+    size_t namelen = strlen(name);
+    size_t entlen = FUSE_NAME_OFFSET + namelen;
+    size_t entlen_padded = FUSE_DIRENT_ALIGN(entlen);
+    struct fuse_dirent *dirent;
+
+    dirent = reinterpret_cast<struct fuse_dirent *>(buf);
+    dirent->ino = ino;
+    dirent->namelen = namelen;
+    memcpy(dirent->name, name, namelen);
+    memset(dirent->name + namelen, 0, entlen_padded - entlen);
+  }
+
+ protected:
+  const std::string test_dir_name_ = "test_dir";
+};
+
+TEST_F(ReaddirTest, SingleEntry) {
+  const std::string test_dir_path =
+      JoinPath(mount_point_.path().c_str(), test_dir_name_);
+
+  const uint64_t ino_dir = 1024;
+  // We need to make sure the test dir is a directory that can be found.
+  mode_t expected_mode =
+      S_IFDIR | S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
+  struct fuse_attr dir_attr = {
+      .ino = ino_dir,
+      .size = 512,
+      .blocks = 4,
+      .mode = expected_mode,
+      .blksize = 4096,
+  };
+
+  // We need to make sure the test dir is a directory that can be found.
+  struct fuse_out_header lookup_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_entry_out),
+  };
+  struct fuse_entry_out lookup_payload = {
+      .nodeid = 1,
+      .entry_valid = true,
+      .attr_valid = true,
+      .attr = dir_attr,
+  };
+
+  struct fuse_out_header open_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_open_out),
+  };
+  struct fuse_open_out open_payload = {
+      .fh = 1,
+  };
+  auto iov_out = FuseGenerateIovecs(lookup_header, lookup_payload);
+  SetServerResponse(FUSE_LOOKUP, iov_out);
+
+  iov_out = FuseGenerateIovecs(open_header, open_payload);
+  SetServerResponse(FUSE_OPENDIR, iov_out);
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(test_dir_path.c_str(), O_RDONLY));
+
+  // The open command makes two syscalls. Lookup the dir file and open.
+  // We don't need to inspect those headers in this test.
+  SkipServerActualRequest();  // LOOKUP.
+  SkipServerActualRequest();  // OPENDIR.
+
+  // Readdir test code.
+  std::string dot = ".";
+  std::string dot_dot = "..";
+  std::string test_file = "testFile";
+
+  // Figure out how many dirents to send over and allocate them appropriately.
+  // Each dirent has a dynamic name and a static metadata part. The dirent size
+  // is aligned to being a multiple of 8.
+  size_t dot_file_dirent_size =
+      FUSE_DIRENT_ALIGN(dot.length() + FUSE_NAME_OFFSET);
+  size_t dot_dot_file_dirent_size =
+      FUSE_DIRENT_ALIGN(dot_dot.length() + FUSE_NAME_OFFSET);
+  size_t test_file_dirent_size =
+      FUSE_DIRENT_ALIGN(test_file.length() + FUSE_NAME_OFFSET);
+
+  // Create an appropriately sized payload.
+  size_t readdir_payload_size =
+      test_file_dirent_size + dot_file_dirent_size + dot_dot_file_dirent_size;
+  std::vector<char> readdir_payload_vec(readdir_payload_size);
+  char *readdir_payload = readdir_payload_vec.data();
+
+  // Use fake ino for other directories.
+  fill_fuse_dirent(readdir_payload, dot.c_str(), ino_dir - 2);
+  fill_fuse_dirent(readdir_payload + dot_file_dirent_size, dot_dot.c_str(),
+                   ino_dir - 1);
+  fill_fuse_dirent(
+      readdir_payload + dot_file_dirent_size + dot_dot_file_dirent_size,
+      test_file.c_str(), ino_dir);
+
+  struct fuse_out_header readdir_header = {
+      .len = uint32_t(sizeof(struct fuse_out_header) + readdir_payload_size),
+  };
+  struct fuse_out_header readdir_header_break = {
+      .len = uint32_t(sizeof(struct fuse_out_header)),
+  };
+
+  iov_out = FuseGenerateIovecs(readdir_header, readdir_payload_vec);
+  SetServerResponse(FUSE_READDIR, iov_out);
+
+  iov_out = FuseGenerateIovecs(readdir_header_break);
+  SetServerResponse(FUSE_READDIR, iov_out);
+
+  std::vector<char> buf(4090, 0);
+  int nread, off = 0, i = 0;
+  EXPECT_THAT(
+      nread = syscall(__NR_getdents64, fd.get(), buf.data(), buf.size()),
+      SyscallSucceeds());
+  for (; off < nread;) {
+    struct dirent64 *ent = (struct dirent64 *)(buf.data() + off);
+    off += ent->d_reclen;
+    switch (i++) {
+      case 0:
+        EXPECT_EQ(std::string(ent->d_name), dot);
+        break;
+      case 1:
+        EXPECT_EQ(std::string(ent->d_name), dot_dot);
+        break;
+      case 2:
+        EXPECT_EQ(std::string(ent->d_name), test_file);
+        break;
+    }
+  }
+
+  EXPECT_THAT(
+      nread = syscall(__NR_getdents64, fd.get(), buf.data(), buf.size()),
+      SyscallSucceedsWithValue(0));
+
+  SkipServerActualRequest();  // READDIR.
+  SkipServerActualRequest();  // READDIR with no data.
+
+  // Clean up.
+  fd.reset(-1);
+
+  struct fuse_in_header in_header;
+  struct fuse_release_in in_payload;
+
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload));
+  EXPECT_EQ(in_header.opcode, FUSE_RELEASEDIR);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/fuse/linux/readlink_test.cc b/test/fuse/linux/readlink_test.cc
new file mode 100644
index 000000000..2cba8fc23
--- /dev/null
+++ b/test/fuse/linux/readlink_test.cc
@@ -0,0 +1,85 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/fuse/linux/fuse_base.h"
+#include "test/util/fuse_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class ReadlinkTest : public FuseTest {
+ protected:
+  const std::string test_file_ = "test_file_";
+  const mode_t perms_ = S_IRWXU | S_IRWXG | S_IRWXO;
+};
+
+TEST_F(ReadlinkTest, ReadSymLink) {
+  const std::string symlink_path =
+      JoinPath(mount_point_.path().c_str(), test_file_);
+  SetServerInodeLookup(test_file_, S_IFLNK | perms_);
+
+  struct fuse_out_header out_header = {
+      .len = static_cast<uint32_t>(sizeof(struct fuse_out_header)) +
+             static_cast<uint32_t>(test_file_.length()) + 1,
+  };
+  std::string link = test_file_;
+  auto iov_out = FuseGenerateIovecs(out_header, link);
+  SetServerResponse(FUSE_READLINK, iov_out);
+  const std::string actual_link =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadLink(symlink_path));
+
+  struct fuse_in_header in_header;
+  auto iov_in = FuseGenerateIovecs(in_header);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_header.len, sizeof(in_header));
+  EXPECT_EQ(in_header.opcode, FUSE_READLINK);
+  EXPECT_EQ(0, memcmp(actual_link.c_str(), link.data(), link.size()));
+
+  // next readlink should have link cached, so shouldn't have new request to
+  // server.
+  uint32_t recieved_before = GetServerTotalReceivedBytes();
+  ASSERT_NO_ERRNO(ReadLink(symlink_path));
+  EXPECT_EQ(GetServerTotalReceivedBytes(), recieved_before);
+}
+
+TEST_F(ReadlinkTest, NotSymlink) {
+  const std::string test_file_path =
+      JoinPath(mount_point_.path().c_str(), test_file_);
+  SetServerInodeLookup(test_file_, S_IFREG | perms_);
+
+  std::vector<char> buf(PATH_MAX + 1);
+  ASSERT_THAT(readlink(test_file_path.c_str(), buf.data(), PATH_MAX),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/fuse/linux/release_test.cc b/test/fuse/linux/release_test.cc
new file mode 100644
index 000000000..b5adb0870
--- /dev/null
+++ b/test/fuse/linux/release_test.cc
@@ -0,0 +1,74 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/fuse/linux/fuse_base.h"
+#include "test/util/fuse_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class ReleaseTest : public FuseTest {
+ protected:
+  const std::string test_file_ = "test_file";
+};
+
+TEST_F(ReleaseTest, RegularFile) {
+  const std::string test_file_path =
+      JoinPath(mount_point_.path().c_str(), test_file_);
+  SetServerInodeLookup(test_file_, S_IFREG | S_IRWXU | S_IRWXG | S_IRWXO);
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_open_out),
+  };
+  struct fuse_open_out out_payload = {
+      .fh = 1,
+      .open_flags = O_RDWR,
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_OPEN, iov_out);
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_path, O_RDWR));
+  SkipServerActualRequest();
+  ASSERT_THAT(close(fd.release()), SyscallSucceeds());
+
+  struct fuse_in_header in_header;
+  struct fuse_release_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload));
+  EXPECT_EQ(in_header.opcode, FUSE_RELEASE);
+  EXPECT_EQ(in_payload.flags, O_RDWR);
+  EXPECT_EQ(in_payload.fh, 1);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/fuse/linux/rmdir_test.cc b/test/fuse/linux/rmdir_test.cc
new file mode 100644
index 000000000..e3200e446
--- /dev/null
+++ b/test/fuse/linux/rmdir_test.cc
@@ -0,0 +1,100 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/fuse/linux/fuse_base.h"
+#include "test/util/fs_util.h"
+#include "test/util/fuse_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class RmDirTest : public FuseTest {
+ protected:
+  const std::string test_dir_name_ = "test_dir";
+  const std::string test_subdir_ = "test_subdir";
+  const mode_t test_dir_mode_ = S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO;
+};
+
+TEST_F(RmDirTest, NormalRmDir) {
+  const std::string test_dir_path_ =
+      JoinPath(mount_point_.path().c_str(), test_dir_name_);
+
+  SetServerInodeLookup(test_dir_name_, test_dir_mode_);
+
+  // RmDir code.
+  struct fuse_out_header rmdir_header = {
+      .len = sizeof(struct fuse_out_header),
+  };
+
+  auto iov_out = FuseGenerateIovecs(rmdir_header);
+  SetServerResponse(FUSE_RMDIR, iov_out);
+
+  ASSERT_THAT(rmdir(test_dir_path_.c_str()), SyscallSucceeds());
+
+  struct fuse_in_header in_header;
+  std::vector<char> actual_dirname(test_dir_name_.length() + 1);
+  auto iov_in = FuseGenerateIovecs(in_header, actual_dirname);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_header.len, sizeof(in_header) + test_dir_name_.length() + 1);
+  EXPECT_EQ(in_header.opcode, FUSE_RMDIR);
+  EXPECT_EQ(std::string(actual_dirname.data()), test_dir_name_);
+}
+
+TEST_F(RmDirTest, NormalRmDirSubdir) {
+  SetServerInodeLookup(test_subdir_, S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO);
+  const std::string test_dir_path_ =
+      JoinPath(mount_point_.path().c_str(), test_subdir_, test_dir_name_);
+  SetServerInodeLookup(test_dir_name_, test_dir_mode_);
+
+  // RmDir code.
+  struct fuse_out_header rmdir_header = {
+      .len = sizeof(struct fuse_out_header),
+  };
+
+  auto iov_out = FuseGenerateIovecs(rmdir_header);
+  SetServerResponse(FUSE_RMDIR, iov_out);
+
+  ASSERT_THAT(rmdir(test_dir_path_.c_str()), SyscallSucceeds());
+
+  struct fuse_in_header in_header;
+  std::vector<char> actual_dirname(test_dir_name_.length() + 1);
+  auto iov_in = FuseGenerateIovecs(in_header, actual_dirname);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_header.len, sizeof(in_header) + test_dir_name_.length() + 1);
+  EXPECT_EQ(in_header.opcode, FUSE_RMDIR);
+  EXPECT_EQ(std::string(actual_dirname.data()), test_dir_name_);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/fuse/linux/setstat_test.cc b/test/fuse/linux/setstat_test.cc
new file mode 100644
index 000000000..68301c775
--- /dev/null
+++ b/test/fuse/linux/setstat_test.cc
@@ -0,0 +1,338 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include <utime.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/fuse/linux/fuse_fd_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/fs_util.h"
+#include "test/util/fuse_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class SetStatTest : public FuseFdTest {
+ public:
+  void SetUp() override {
+    FuseFdTest::SetUp();
+    test_dir_path_ = JoinPath(mount_point_.path(), test_dir_);
+    test_file_path_ = JoinPath(mount_point_.path(), test_file_);
+  }
+
+ protected:
+  const uint64_t fh = 23;
+  const std::string test_dir_ = "testdir";
+  const std::string test_file_ = "testfile";
+  const mode_t test_dir_mode_ = S_IFDIR | S_IRUSR | S_IWUSR | S_IXUSR;
+  const mode_t test_file_mode_ = S_IFREG | S_IRUSR | S_IWUSR | S_IXUSR;
+
+  std::string test_dir_path_;
+  std::string test_file_path_;
+};
+
+TEST_F(SetStatTest, ChmodDir) {
+  // Set up fixture.
+  SetServerInodeLookup(test_dir_, test_dir_mode_);
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_attr_out),
+      .error = 0,
+  };
+  mode_t set_mode = S_IRGRP | S_IWGRP | S_IXGRP;
+  struct fuse_attr_out out_payload = {
+      .attr = DefaultFuseAttr(set_mode, 2),
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_SETATTR, iov_out);
+
+  // Make syscall.
+  EXPECT_THAT(chmod(test_dir_path_.c_str(), set_mode), SyscallSucceeds());
+
+  // Check FUSE request.
+  struct fuse_in_header in_header;
+  struct fuse_setattr_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload));
+  EXPECT_EQ(in_header.opcode, FUSE_SETATTR);
+  EXPECT_EQ(in_header.uid, 0);
+  EXPECT_EQ(in_header.gid, 0);
+  EXPECT_EQ(in_payload.valid, FATTR_MODE);
+  EXPECT_EQ(in_payload.mode, S_IFDIR | set_mode);
+}
+
+TEST_F(SetStatTest, ChownDir) {
+  // Set up fixture.
+  SetServerInodeLookup(test_dir_, test_dir_mode_);
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_attr_out),
+      .error = 0,
+  };
+  struct fuse_attr_out out_payload = {
+      .attr = DefaultFuseAttr(test_dir_mode_, 2),
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_SETATTR, iov_out);
+
+  // Make syscall.
+  EXPECT_THAT(chown(test_dir_path_.c_str(), 1025, 1025), SyscallSucceeds());
+
+  // Check FUSE request.
+  struct fuse_in_header in_header;
+  struct fuse_setattr_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload));
+  EXPECT_EQ(in_header.opcode, FUSE_SETATTR);
+  EXPECT_EQ(in_header.uid, 0);
+  EXPECT_EQ(in_header.gid, 0);
+  EXPECT_EQ(in_payload.valid, FATTR_UID | FATTR_GID);
+  EXPECT_EQ(in_payload.uid, 1025);
+  EXPECT_EQ(in_payload.gid, 1025);
+}
+
+TEST_F(SetStatTest, TruncateFile) {
+  // Set up fixture.
+  SetServerInodeLookup(test_file_, test_file_mode_);
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_attr_out),
+      .error = 0,
+  };
+  struct fuse_attr_out out_payload = {
+      .attr = DefaultFuseAttr(S_IFREG | S_IRUSR | S_IWUSR, 2),
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_SETATTR, iov_out);
+
+  // Make syscall.
+  EXPECT_THAT(truncate(test_file_path_.c_str(), 321), SyscallSucceeds());
+
+  // Check FUSE request.
+  struct fuse_in_header in_header;
+  struct fuse_setattr_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload));
+  EXPECT_EQ(in_header.opcode, FUSE_SETATTR);
+  EXPECT_EQ(in_header.uid, 0);
+  EXPECT_EQ(in_header.gid, 0);
+  EXPECT_EQ(in_payload.valid, FATTR_SIZE);
+  EXPECT_EQ(in_payload.size, 321);
+}
+
+TEST_F(SetStatTest, UtimeFile) {
+  // Set up fixture.
+  SetServerInodeLookup(test_file_, test_file_mode_);
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_attr_out),
+      .error = 0,
+  };
+  struct fuse_attr_out out_payload = {
+      .attr = DefaultFuseAttr(S_IFREG | S_IRUSR | S_IWUSR, 2),
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_SETATTR, iov_out);
+
+  // Make syscall.
+  time_t expected_atime = 1597159766, expected_mtime = 1597159765;
+  struct utimbuf times = {
+      .actime = expected_atime,
+      .modtime = expected_mtime,
+  };
+  EXPECT_THAT(utime(test_file_path_.c_str(), &times), SyscallSucceeds());
+
+  // Check FUSE request.
+  struct fuse_in_header in_header;
+  struct fuse_setattr_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload));
+  EXPECT_EQ(in_header.opcode, FUSE_SETATTR);
+  EXPECT_EQ(in_header.uid, 0);
+  EXPECT_EQ(in_header.gid, 0);
+  EXPECT_EQ(in_payload.valid, FATTR_ATIME | FATTR_MTIME);
+  EXPECT_EQ(in_payload.atime, expected_atime);
+  EXPECT_EQ(in_payload.mtime, expected_mtime);
+}
+
+TEST_F(SetStatTest, UtimesFile) {
+  // Set up fixture.
+  SetServerInodeLookup(test_file_, test_file_mode_);
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_attr_out),
+      .error = 0,
+  };
+  struct fuse_attr_out out_payload = {
+      .attr = DefaultFuseAttr(test_file_mode_, 2),
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_SETATTR, iov_out);
+
+  // Make syscall.
+  struct timeval expected_times[2] = {
+      {
+          .tv_sec = 1597159766,
+          .tv_usec = 234945,
+      },
+      {
+          .tv_sec = 1597159765,
+          .tv_usec = 232341,
+      },
+  };
+  EXPECT_THAT(utimes(test_file_path_.c_str(), expected_times),
+              SyscallSucceeds());
+
+  // Check FUSE request.
+  struct fuse_in_header in_header;
+  struct fuse_setattr_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload));
+  EXPECT_EQ(in_header.opcode, FUSE_SETATTR);
+  EXPECT_EQ(in_header.uid, 0);
+  EXPECT_EQ(in_header.gid, 0);
+  EXPECT_EQ(in_payload.valid, FATTR_ATIME | FATTR_MTIME);
+  EXPECT_EQ(in_payload.atime, expected_times[0].tv_sec);
+  EXPECT_EQ(in_payload.atimensec, expected_times[0].tv_usec * 1000);
+  EXPECT_EQ(in_payload.mtime, expected_times[1].tv_sec);
+  EXPECT_EQ(in_payload.mtimensec, expected_times[1].tv_usec * 1000);
+}
+
+TEST_F(SetStatTest, FtruncateFile) {
+  // Set up fixture.
+  SetServerInodeLookup(test_file_, test_file_mode_);
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenPath(test_file_path_, O_RDWR, fh));
+  auto close_fd = CloseFD(fd);
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_attr_out),
+      .error = 0,
+  };
+  struct fuse_attr_out out_payload = {
+      .attr = DefaultFuseAttr(test_file_mode_, 2),
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_SETATTR, iov_out);
+
+  // Make syscall.
+  EXPECT_THAT(ftruncate(fd.get(), 321), SyscallSucceeds());
+
+  // Check FUSE request.
+  struct fuse_in_header in_header;
+  struct fuse_setattr_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload));
+  EXPECT_EQ(in_header.opcode, FUSE_SETATTR);
+  EXPECT_EQ(in_header.uid, 0);
+  EXPECT_EQ(in_header.gid, 0);
+  EXPECT_EQ(in_payload.valid, FATTR_SIZE | FATTR_FH);
+  EXPECT_EQ(in_payload.fh, fh);
+  EXPECT_EQ(in_payload.size, 321);
+}
+
+TEST_F(SetStatTest, FchmodFile) {
+  // Set up fixture.
+  SetServerInodeLookup(test_file_, test_file_mode_);
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenPath(test_file_path_, O_RDWR, fh));
+  auto close_fd = CloseFD(fd);
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_attr_out),
+      .error = 0,
+  };
+  mode_t set_mode = S_IROTH | S_IWOTH | S_IXOTH;
+  struct fuse_attr_out out_payload = {
+      .attr = DefaultFuseAttr(set_mode, 2),
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_SETATTR, iov_out);
+
+  // Make syscall.
+  EXPECT_THAT(fchmod(fd.get(), set_mode), SyscallSucceeds());
+
+  // Check FUSE request.
+  struct fuse_in_header in_header;
+  struct fuse_setattr_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload));
+  EXPECT_EQ(in_header.opcode, FUSE_SETATTR);
+  EXPECT_EQ(in_header.uid, 0);
+  EXPECT_EQ(in_header.gid, 0);
+  EXPECT_EQ(in_payload.valid, FATTR_MODE | FATTR_FH);
+  EXPECT_EQ(in_payload.fh, fh);
+  EXPECT_EQ(in_payload.mode, S_IFREG | set_mode);
+}
+
+TEST_F(SetStatTest, FchownFile) {
+  // Set up fixture.
+  SetServerInodeLookup(test_file_, test_file_mode_);
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenPath(test_file_path_, O_RDWR, fh));
+  auto close_fd = CloseFD(fd);
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_attr_out),
+      .error = 0,
+  };
+  struct fuse_attr_out out_payload = {
+      .attr = DefaultFuseAttr(S_IFREG | S_IRUSR | S_IWUSR | S_IXUSR, 2),
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_SETATTR, iov_out);
+
+  // Make syscall.
+  EXPECT_THAT(fchown(fd.get(), 1025, 1025), SyscallSucceeds());
+
+  // Check FUSE request.
+  struct fuse_in_header in_header;
+  struct fuse_setattr_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.len, sizeof(in_header) + sizeof(in_payload));
+  EXPECT_EQ(in_header.opcode, FUSE_SETATTR);
+  EXPECT_EQ(in_header.uid, 0);
+  EXPECT_EQ(in_header.gid, 0);
+  EXPECT_EQ(in_payload.valid, FATTR_UID | FATTR_GID | FATTR_FH);
+  EXPECT_EQ(in_payload.fh, fh);
+  EXPECT_EQ(in_payload.uid, 1025);
+  EXPECT_EQ(in_payload.gid, 1025);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/fuse/linux/stat_test.cc b/test/fuse/linux/stat_test.cc
index 172e09867..6f032cac1 100644
--- a/test/fuse/linux/stat_test.cc
+++ b/test/fuse/linux/stat_test.cc
@@ -18,12 +18,16 @@
 #include <sys/stat.h>
 #include <sys/statfs.h>
 #include <sys/types.h>
+#include <sys/uio.h>
 #include <unistd.h>
 
 #include <vector>
 
 #include "gtest/gtest.h"
-#include "test/fuse/linux/fuse_base.h"
+#include "test/fuse/linux/fuse_fd_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/fs_util.h"
+#include "test/util/fuse_util.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
@@ -31,89 +35,45 @@ namespace testing {
 
 namespace {
 
-class StatTest : public FuseTest {
+class StatTest : public FuseFdTest {
  public:
-  bool CompareRequest(void* expected_mem, size_t expected_len, void* real_mem,
-                      size_t real_len) override {
-    if (expected_len != real_len) return false;
-    struct fuse_in_header* real_header =
-        reinterpret_cast<fuse_in_header*>(real_mem);
-
-    if (real_header->opcode != FUSE_GETATTR) {
-      std::cerr << "expect header opcode " << FUSE_GETATTR << " but got "
-                << real_header->opcode << std::endl;
-      return false;
-    }
-    return true;
+  void SetUp() override {
+    FuseFdTest::SetUp();
+    test_file_path_ = JoinPath(mount_point_.path(), test_file_);
   }
 
+ protected:
   bool StatsAreEqual(struct stat expected, struct stat actual) {
-    // device number will be dynamically allocated by kernel, we cannot know
-    // in advance
+    // Device number will be dynamically allocated by kernel, we cannot know in
+    // advance.
     actual.st_dev = expected.st_dev;
     return memcmp(&expected, &actual, sizeof(struct stat)) == 0;
   }
+
+  const std::string test_file_ = "testfile";
+  const mode_t expected_mode = S_IFREG | S_IRUSR | S_IWUSR;
+  const uint64_t fh = 23;
+
+  std::string test_file_path_;
 };
 
 TEST_F(StatTest, StatNormal) {
-  struct iovec iov_in[2];
-  struct iovec iov_out[2];
-
-  struct fuse_in_header in_header = {
-      .len = sizeof(struct fuse_in_header) + sizeof(struct fuse_getattr_in),
-      .opcode = FUSE_GETATTR,
-      .unique = 4,
-      .nodeid = 1,
-      .uid = 0,
-      .gid = 0,
-      .pid = 4,
-      .padding = 0,
-  };
-  struct fuse_getattr_in in_payload = {0};
-  iov_in[0].iov_len = sizeof(in_header);
-  iov_in[0].iov_base = &in_header;
-  iov_in[1].iov_len = sizeof(in_payload);
-  iov_in[1].iov_base = &in_payload;
-
-  mode_t expected_mode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
-  struct timespec atime = {.tv_sec = 1595436289, .tv_nsec = 134150844};
-  struct timespec mtime = {.tv_sec = 1595436290, .tv_nsec = 134150845};
-  struct timespec ctime = {.tv_sec = 1595436291, .tv_nsec = 134150846};
+  // Set up fixture.
+  struct fuse_attr attr = DefaultFuseAttr(expected_mode, 1);
   struct fuse_out_header out_header = {
       .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_attr_out),
-      .error = 0,
-      .unique = 4,
-  };
-  struct fuse_attr attr = {
-      .ino = 1,
-      .size = 512,
-      .blocks = 4,
-      .atime = static_cast<uint64_t>(atime.tv_sec),
-      .mtime = static_cast<uint64_t>(mtime.tv_sec),
-      .ctime = static_cast<uint64_t>(ctime.tv_sec),
-      .atimensec = static_cast<uint32_t>(atime.tv_nsec),
-      .mtimensec = static_cast<uint32_t>(mtime.tv_nsec),
-      .ctimensec = static_cast<uint32_t>(ctime.tv_nsec),
-      .mode = expected_mode,
-      .nlink = 2,
-      .uid = 1234,
-      .gid = 4321,
-      .rdev = 12,
-      .blksize = 4096,
   };
   struct fuse_attr_out out_payload = {
       .attr = attr,
   };
-  iov_out[0].iov_len = sizeof(out_header);
-  iov_out[0].iov_base = &out_header;
-  iov_out[1].iov_len = sizeof(out_payload);
-  iov_out[1].iov_base = &out_payload;
-
-  SetExpected(iov_in, 2, iov_out, 2);
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_GETATTR, iov_out);
 
+  // Make syscall.
   struct stat stat_buf;
   EXPECT_THAT(stat(mount_point_.path().c_str(), &stat_buf), SyscallSucceeds());
 
+  // Check filesystem operation result.
   struct stat expected_stat = {
       .st_ino = attr.ino,
       .st_nlink = attr.nlink,
@@ -124,43 +84,133 @@ TEST_F(StatTest, StatNormal) {
       .st_size = static_cast<off_t>(attr.size),
       .st_blksize = attr.blksize,
       .st_blocks = static_cast<blkcnt_t>(attr.blocks),
-      .st_atim = atime,
-      .st_mtim = mtime,
-      .st_ctim = ctime,
+      .st_atim = (struct timespec){.tv_sec = static_cast<int>(attr.atime),
+                                   .tv_nsec = attr.atimensec},
+      .st_mtim = (struct timespec){.tv_sec = static_cast<int>(attr.mtime),
+                                   .tv_nsec = attr.mtimensec},
+      .st_ctim = (struct timespec){.tv_sec = static_cast<int>(attr.ctime),
+                                   .tv_nsec = attr.ctimensec},
   };
   EXPECT_TRUE(StatsAreEqual(stat_buf, expected_stat));
-  WaitCompleted();
-}
 
-TEST_F(StatTest, StatNotFound) {
-  struct iovec iov_in[2];
-  struct iovec iov_out[2];
+  // Check FUSE request.
+  struct fuse_in_header in_header;
+  struct fuse_getattr_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
 
-  struct fuse_in_header in_header = {
-      .len = sizeof(struct fuse_in_header) + sizeof(struct fuse_getattr_in),
-      .opcode = FUSE_GETATTR,
-      .unique = 4,
-  };
-  struct fuse_getattr_in in_payload = {0};
-  iov_in[0].iov_len = sizeof(in_header);
-  iov_in[0].iov_base = &in_header;
-  iov_in[1].iov_len = sizeof(in_payload);
-  iov_in[1].iov_base = &in_payload;
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.opcode, FUSE_GETATTR);
+  EXPECT_EQ(in_payload.getattr_flags, 0);
+  EXPECT_EQ(in_payload.fh, 0);
+}
 
+TEST_F(StatTest, StatNotFound) {
+  // Set up fixture.
   struct fuse_out_header out_header = {
       .len = sizeof(struct fuse_out_header),
       .error = -ENOENT,
-      .unique = 4,
   };
-  iov_out[0].iov_len = sizeof(out_header);
-  iov_out[0].iov_base = &out_header;
-
-  SetExpected(iov_in, 2, iov_out, 1);
+  auto iov_out = FuseGenerateIovecs(out_header);
+  SetServerResponse(FUSE_GETATTR, iov_out);
 
+  // Make syscall.
   struct stat stat_buf;
   EXPECT_THAT(stat(mount_point_.path().c_str(), &stat_buf),
               SyscallFailsWithErrno(ENOENT));
-  WaitCompleted();
+
+  // Check FUSE request.
+  struct fuse_in_header in_header;
+  struct fuse_getattr_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.opcode, FUSE_GETATTR);
+  EXPECT_EQ(in_payload.getattr_flags, 0);
+  EXPECT_EQ(in_payload.fh, 0);
+}
+
+TEST_F(StatTest, FstatNormal) {
+  // Set up fixture.
+  SetServerInodeLookup(test_file_);
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenPath(test_file_path_, O_RDONLY, fh));
+  auto close_fd = CloseFD(fd);
+
+  struct fuse_attr attr = DefaultFuseAttr(expected_mode, 2);
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_attr_out),
+  };
+  struct fuse_attr_out out_payload = {
+      .attr = attr,
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_GETATTR, iov_out);
+
+  // Make syscall.
+  struct stat stat_buf;
+  EXPECT_THAT(fstat(fd.get(), &stat_buf), SyscallSucceeds());
+
+  // Check filesystem operation result.
+  struct stat expected_stat = {
+      .st_ino = attr.ino,
+      .st_nlink = attr.nlink,
+      .st_mode = expected_mode,
+      .st_uid = attr.uid,
+      .st_gid = attr.gid,
+      .st_rdev = attr.rdev,
+      .st_size = static_cast<off_t>(attr.size),
+      .st_blksize = attr.blksize,
+      .st_blocks = static_cast<blkcnt_t>(attr.blocks),
+      .st_atim = (struct timespec){.tv_sec = static_cast<int>(attr.atime),
+                                   .tv_nsec = attr.atimensec},
+      .st_mtim = (struct timespec){.tv_sec = static_cast<int>(attr.mtime),
+                                   .tv_nsec = attr.mtimensec},
+      .st_ctim = (struct timespec){.tv_sec = static_cast<int>(attr.ctime),
+                                   .tv_nsec = attr.ctimensec},
+  };
+  EXPECT_TRUE(StatsAreEqual(stat_buf, expected_stat));
+
+  // Check FUSE request.
+  struct fuse_in_header in_header;
+  struct fuse_getattr_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.opcode, FUSE_GETATTR);
+  EXPECT_EQ(in_payload.getattr_flags, 0);
+  EXPECT_EQ(in_payload.fh, 0);
+}
+
+TEST_F(StatTest, StatByFileHandle) {
+  // Set up fixture.
+  SetServerInodeLookup(test_file_, expected_mode, 0);
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenPath(test_file_path_, O_RDONLY, fh));
+  auto close_fd = CloseFD(fd);
+
+  struct fuse_attr attr = DefaultFuseAttr(expected_mode, 2, 0);
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_attr_out),
+  };
+  struct fuse_attr_out out_payload = {
+      .attr = attr,
+  };
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_GETATTR, iov_out);
+
+  // Make syscall.
+  std::vector<char> buf(1);
+  // Since this is an empty file, it won't issue FUSE_READ. But a FUSE_GETATTR
+  // will be issued before read completes.
+  EXPECT_THAT(read(fd.get(), buf.data(), buf.size()), SyscallSucceeds());
+
+  // Check FUSE request.
+  struct fuse_in_header in_header;
+  struct fuse_getattr_in in_payload;
+  auto iov_in = FuseGenerateIovecs(in_header, in_payload);
+
+  GetServerActualRequest(iov_in);
+  EXPECT_EQ(in_header.opcode, FUSE_GETATTR);
+  EXPECT_EQ(in_payload.getattr_flags, FUSE_GETATTR_FH);
+  EXPECT_EQ(in_payload.fh, fh);
 }
 
 }  // namespace
diff --git a/test/fuse/linux/symlink_test.cc b/test/fuse/linux/symlink_test.cc
new file mode 100644
index 000000000..2c3a52987
--- /dev/null
+++ b/test/fuse/linux/symlink_test.cc
@@ -0,0 +1,88 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/fuse/linux/fuse_base.h"
+#include "test/util/fuse_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class SymlinkTest : public FuseTest {
+ protected:
+  const std::string target_file_ = "target_file_";
+  const std::string symlink_ = "symlink_";
+  const mode_t perms_ = S_IRWXU | S_IRWXG | S_IRWXO;
+};
+
+TEST_F(SymlinkTest, CreateSymLink) {
+  const std::string symlink_path =
+      JoinPath(mount_point_.path().c_str(), symlink_);
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_entry_out),
+  };
+  struct fuse_entry_out out_payload = DefaultEntryOut(S_IFLNK | perms_, 5);
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_SYMLINK, iov_out);
+  ASSERT_THAT(symlink(target_file_.c_str(), symlink_path.c_str()),
+              SyscallSucceeds());
+
+  struct fuse_in_header in_header;
+  std::vector<char> actual_target_file(target_file_.length() + 1);
+  std::vector<char> actual_symlink(symlink_.length() + 1);
+  auto iov_in =
+      FuseGenerateIovecs(in_header, actual_symlink, actual_target_file);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_header.len,
+            sizeof(in_header) + symlink_.length() + target_file_.length() + 2);
+  EXPECT_EQ(in_header.opcode, FUSE_SYMLINK);
+  EXPECT_EQ(std::string(actual_target_file.data()), target_file_);
+  EXPECT_EQ(std::string(actual_symlink.data()), symlink_);
+}
+
+TEST_F(SymlinkTest, FileTypeError) {
+  const std::string symlink_path =
+      JoinPath(mount_point_.path().c_str(), symlink_);
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_entry_out),
+  };
+  struct fuse_entry_out out_payload = DefaultEntryOut(S_IFREG | perms_, 5);
+  auto iov_out = FuseGenerateIovecs(out_header, out_payload);
+  SetServerResponse(FUSE_SYMLINK, iov_out);
+  ASSERT_THAT(symlink(target_file_.c_str(), symlink_path.c_str()),
+              SyscallFailsWithErrno(EIO));
+  SkipServerActualRequest();
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/fuse/linux/unlink_test.cc b/test/fuse/linux/unlink_test.cc
new file mode 100644
index 000000000..13efbf7c7
--- /dev/null
+++ b/test/fuse/linux/unlink_test.cc
@@ -0,0 +1,107 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/fuse/linux/fuse_base.h"
+#include "test/util/fuse_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class UnlinkTest : public FuseTest {
+ protected:
+  const std::string test_file_ = "test_file";
+  const std::string test_subdir_ = "test_subdir";
+};
+
+TEST_F(UnlinkTest, RegularFile) {
+  const std::string test_file_path =
+      JoinPath(mount_point_.path().c_str(), test_file_);
+  SetServerInodeLookup(test_file_, S_IFREG | S_IRWXU | S_IRWXG | S_IRWXO);
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header),
+  };
+  auto iov_out = FuseGenerateIovecs(out_header);
+  SetServerResponse(FUSE_UNLINK, iov_out);
+
+  ASSERT_THAT(unlink(test_file_path.c_str()), SyscallSucceeds());
+  struct fuse_in_header in_header;
+  std::vector<char> unlinked_file(test_file_.length() + 1);
+  auto iov_in = FuseGenerateIovecs(in_header, unlinked_file);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_header.len, sizeof(in_header) + test_file_.length() + 1);
+  EXPECT_EQ(in_header.opcode, FUSE_UNLINK);
+  EXPECT_EQ(std::string(unlinked_file.data()), test_file_);
+}
+
+TEST_F(UnlinkTest, RegularFileSubDir) {
+  SetServerInodeLookup(test_subdir_, S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO);
+  const std::string test_file_path =
+      JoinPath(mount_point_.path().c_str(), test_subdir_, test_file_);
+  SetServerInodeLookup(test_file_, S_IFREG | S_IRWXU | S_IRWXG | S_IRWXO);
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header),
+  };
+  auto iov_out = FuseGenerateIovecs(out_header);
+  SetServerResponse(FUSE_UNLINK, iov_out);
+
+  ASSERT_THAT(unlink(test_file_path.c_str()), SyscallSucceeds());
+  struct fuse_in_header in_header;
+  std::vector<char> unlinked_file(test_file_.length() + 1);
+  auto iov_in = FuseGenerateIovecs(in_header, unlinked_file);
+  GetServerActualRequest(iov_in);
+
+  EXPECT_EQ(in_header.len, sizeof(in_header) + test_file_.length() + 1);
+  EXPECT_EQ(in_header.opcode, FUSE_UNLINK);
+  EXPECT_EQ(std::string(unlinked_file.data()), test_file_);
+}
+
+TEST_F(UnlinkTest, NoFile) {
+  const std::string test_file_path =
+      JoinPath(mount_point_.path().c_str(), test_file_);
+  SetServerInodeLookup(test_file_, S_IFREG | S_IRWXU | S_IRWXG | S_IRWXO);
+
+  struct fuse_out_header out_header = {
+      .len = sizeof(struct fuse_out_header),
+      .error = -ENOENT,
+  };
+  auto iov_out = FuseGenerateIovecs(out_header);
+  SetServerResponse(FUSE_UNLINK, iov_out);
+
+  ASSERT_THAT(unlink(test_file_path.c_str()), SyscallFailsWithErrno(ENOENT));
+  SkipServerActualRequest();
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/fuse/linux/write_test.cc b/test/fuse/linux/write_test.cc
new file mode 100644
index 000000000..1a62beb96
--- /dev/null
+++ b/test/fuse/linux/write_test.cc
@@ -0,0 +1,303 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fuse.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/fuse/linux/fuse_base.h"
+#include "test/util/fuse_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class WriteTest : public FuseTest {
+  void SetUp() override {
+    FuseTest::SetUp();
+    test_file_path_ = JoinPath(mount_point_.path().c_str(), test_file_);
+  }
+
+  // TearDown overrides the parent's function
+  // to skip checking the unconsumed release request at the end.
+  void TearDown() override { UnmountFuse(); }
+
+ protected:
+  const std::string test_file_ = "test_file";
+  const mode_t test_file_mode_ = S_IFREG | S_IRWXU | S_IRWXG | S_IRWXO;
+  const uint64_t test_fh_ = 1;
+  const uint32_t open_flag_ = O_RDWR;
+
+  std::string test_file_path_;
+
+  PosixErrorOr<FileDescriptor> OpenTestFile(const std::string &path,
+                                            uint64_t size = 512) {
+    SetServerInodeLookup(test_file_, test_file_mode_, size);
+
+    struct fuse_out_header out_header_open = {
+        .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_open_out),
+    };
+    struct fuse_open_out out_payload_open = {
+        .fh = test_fh_,
+        .open_flags = open_flag_,
+    };
+    auto iov_out_open = FuseGenerateIovecs(out_header_open, out_payload_open);
+    SetServerResponse(FUSE_OPEN, iov_out_open);
+
+    auto res = Open(path.c_str(), open_flag_);
+    if (res.ok()) {
+      SkipServerActualRequest();
+    }
+    return res;
+  }
+};
+
+class WriteTestSmallMaxWrite : public WriteTest {
+  void SetUp() override {
+    MountFuse();
+    SetUpFuseServer(&fuse_init_payload);
+    test_file_path_ = JoinPath(mount_point_.path().c_str(), test_file_);
+  }
+
+ protected:
+  const static uint32_t max_write_ = 4096;
+  constexpr static struct fuse_init_out fuse_init_payload = {
+      .major = 7,
+      .max_write = max_write_,
+  };
+
+  const uint32_t size_fragment = max_write_;
+};
+
+TEST_F(WriteTest, WriteNormal) {
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_));
+
+  // Prepare for the write.
+  const int n_write = 10;
+  struct fuse_out_header out_header_write = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_write_out),
+  };
+  struct fuse_write_out out_payload_write = {
+      .size = n_write,
+  };
+  auto iov_out_write = FuseGenerateIovecs(out_header_write, out_payload_write);
+  SetServerResponse(FUSE_WRITE, iov_out_write);
+
+  // Issue the write.
+  std::vector<char> buf(n_write);
+  RandomizeBuffer(buf.data(), buf.size());
+  EXPECT_THAT(write(fd.get(), buf.data(), n_write),
+              SyscallSucceedsWithValue(n_write));
+
+  // Check the write request.
+  struct fuse_in_header in_header_write;
+  struct fuse_write_in in_payload_write;
+  std::vector<char> payload_buf(n_write);
+  auto iov_in_write =
+      FuseGenerateIovecs(in_header_write, in_payload_write, payload_buf);
+  GetServerActualRequest(iov_in_write);
+
+  EXPECT_EQ(in_payload_write.fh, test_fh_);
+  EXPECT_EQ(in_header_write.len,
+            sizeof(in_header_write) + sizeof(in_payload_write));
+  EXPECT_EQ(in_header_write.opcode, FUSE_WRITE);
+  EXPECT_EQ(in_payload_write.offset, 0);
+  EXPECT_EQ(in_payload_write.size, n_write);
+  EXPECT_EQ(buf, payload_buf);
+}
+
+TEST_F(WriteTest, WriteShort) {
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_));
+
+  // Prepare for the write.
+  const int n_write = 10, n_written = 5;
+  struct fuse_out_header out_header_write = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_write_out),
+  };
+  struct fuse_write_out out_payload_write = {
+      .size = n_written,
+  };
+  auto iov_out_write = FuseGenerateIovecs(out_header_write, out_payload_write);
+  SetServerResponse(FUSE_WRITE, iov_out_write);
+
+  // Issue the write.
+  std::vector<char> buf(n_write);
+  RandomizeBuffer(buf.data(), buf.size());
+  EXPECT_THAT(write(fd.get(), buf.data(), n_write),
+              SyscallSucceedsWithValue(n_written));
+
+  // Check the write request.
+  struct fuse_in_header in_header_write;
+  struct fuse_write_in in_payload_write;
+  std::vector<char> payload_buf(n_write);
+  auto iov_in_write =
+      FuseGenerateIovecs(in_header_write, in_payload_write, payload_buf);
+  GetServerActualRequest(iov_in_write);
+
+  EXPECT_EQ(in_payload_write.fh, test_fh_);
+  EXPECT_EQ(in_header_write.len,
+            sizeof(in_header_write) + sizeof(in_payload_write));
+  EXPECT_EQ(in_header_write.opcode, FUSE_WRITE);
+  EXPECT_EQ(in_payload_write.offset, 0);
+  EXPECT_EQ(in_payload_write.size, n_write);
+  EXPECT_EQ(buf, payload_buf);
+}
+
+TEST_F(WriteTest, WriteShortZero) {
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_));
+
+  // Prepare for the write.
+  const int n_write = 10;
+  struct fuse_out_header out_header_write = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_write_out),
+  };
+  struct fuse_write_out out_payload_write = {
+      .size = 0,
+  };
+  auto iov_out_write = FuseGenerateIovecs(out_header_write, out_payload_write);
+  SetServerResponse(FUSE_WRITE, iov_out_write);
+
+  // Issue the write.
+  std::vector<char> buf(n_write);
+  RandomizeBuffer(buf.data(), buf.size());
+  EXPECT_THAT(write(fd.get(), buf.data(), n_write), SyscallFailsWithErrno(EIO));
+
+  // Check the write request.
+  struct fuse_in_header in_header_write;
+  struct fuse_write_in in_payload_write;
+  std::vector<char> payload_buf(n_write);
+  auto iov_in_write =
+      FuseGenerateIovecs(in_header_write, in_payload_write, payload_buf);
+  GetServerActualRequest(iov_in_write);
+
+  EXPECT_EQ(in_payload_write.fh, test_fh_);
+  EXPECT_EQ(in_header_write.len,
+            sizeof(in_header_write) + sizeof(in_payload_write));
+  EXPECT_EQ(in_header_write.opcode, FUSE_WRITE);
+  EXPECT_EQ(in_payload_write.offset, 0);
+  EXPECT_EQ(in_payload_write.size, n_write);
+  EXPECT_EQ(buf, payload_buf);
+}
+
+TEST_F(WriteTest, WriteZero) {
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_));
+
+  // Issue the write.
+  std::vector<char> buf(0);
+  EXPECT_THAT(write(fd.get(), buf.data(), 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(WriteTest, PWrite) {
+  const int file_size = 512;
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_, file_size));
+
+  // Prepare for the write.
+  const int n_write = 10;
+  struct fuse_out_header out_header_write = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_write_out),
+  };
+  struct fuse_write_out out_payload_write = {
+      .size = n_write,
+  };
+  auto iov_out_write = FuseGenerateIovecs(out_header_write, out_payload_write);
+  SetServerResponse(FUSE_WRITE, iov_out_write);
+
+  // Issue the write.
+  std::vector<char> buf(n_write);
+  RandomizeBuffer(buf.data(), buf.size());
+  const int offset_write = file_size >> 1;
+  EXPECT_THAT(pwrite(fd.get(), buf.data(), n_write, offset_write),
+              SyscallSucceedsWithValue(n_write));
+
+  // Check the write request.
+  struct fuse_in_header in_header_write;
+  struct fuse_write_in in_payload_write;
+  std::vector<char> payload_buf(n_write);
+  auto iov_in_write =
+      FuseGenerateIovecs(in_header_write, in_payload_write, payload_buf);
+  GetServerActualRequest(iov_in_write);
+
+  EXPECT_EQ(in_payload_write.fh, test_fh_);
+  EXPECT_EQ(in_header_write.len,
+            sizeof(in_header_write) + sizeof(in_payload_write));
+  EXPECT_EQ(in_header_write.opcode, FUSE_WRITE);
+  EXPECT_EQ(in_payload_write.offset, offset_write);
+  EXPECT_EQ(in_payload_write.size, n_write);
+  EXPECT_EQ(buf, payload_buf);
+}
+
+TEST_F(WriteTestSmallMaxWrite, WriteSmallMaxWrie) {
+  const int n_fragment = 10;
+  const int n_write = size_fragment * n_fragment;
+
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenTestFile(test_file_path_, n_write));
+
+  // Prepare for the write.
+  struct fuse_out_header out_header_write = {
+      .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_write_out),
+  };
+  struct fuse_write_out out_payload_write = {
+      .size = size_fragment,
+  };
+  auto iov_out_write = FuseGenerateIovecs(out_header_write, out_payload_write);
+
+  for (int i = 0; i < n_fragment; ++i) {
+    SetServerResponse(FUSE_WRITE, iov_out_write);
+  }
+
+  // Issue the write.
+  std::vector<char> buf(n_write);
+  RandomizeBuffer(buf.data(), buf.size());
+  EXPECT_THAT(write(fd.get(), buf.data(), n_write),
+              SyscallSucceedsWithValue(n_write));
+
+  ASSERT_EQ(GetServerNumUnsentResponses(), 0);
+  ASSERT_EQ(GetServerNumUnconsumedRequests(), n_fragment);
+
+  // Check the write request.
+  struct fuse_in_header in_header_write;
+  struct fuse_write_in in_payload_write;
+  std::vector<char> payload_buf(size_fragment);
+  auto iov_in_write =
+      FuseGenerateIovecs(in_header_write, in_payload_write, payload_buf);
+
+  for (int i = 0; i < n_fragment; ++i) {
+    GetServerActualRequest(iov_in_write);
+
+    EXPECT_EQ(in_payload_write.fh, test_fh_);
+    EXPECT_EQ(in_header_write.len,
+              sizeof(in_header_write) + sizeof(in_payload_write));
+    EXPECT_EQ(in_header_write.opcode, FUSE_WRITE);
+    EXPECT_EQ(in_payload_write.offset, i * size_fragment);
+    EXPECT_EQ(in_payload_write.size, size_fragment);
+
+    auto it = buf.begin() + i * size_fragment;
+    EXPECT_EQ(std::vector<char>(it, it + size_fragment), payload_buf);
+  }
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/image/image_test.go b/test/image/image_test.go
index ac6186688..968e62f63 100644
--- a/test/image/image_test.go
+++ b/test/image/image_test.go
@@ -63,8 +63,8 @@ func TestHelloWorld(t *testing.T) {
 	}
 }
 
-func runHTTPRequest(port int) error {
-	url := fmt.Sprintf("http://localhost:%d/not-found", port)
+func runHTTPRequest(ip string, port int) error {
+	url := fmt.Sprintf("http://%s:%d/not-found", ip, port)
 	resp, err := http.Get(url)
 	if err != nil {
 		return fmt.Errorf("error reaching http server: %v", err)
@@ -73,7 +73,7 @@ func runHTTPRequest(port int) error {
 		return fmt.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want)
 	}
 
-	url = fmt.Sprintf("http://localhost:%d/latin10k.txt", port)
+	url = fmt.Sprintf("http://%s:%d/latin10k.txt", ip, port)
 	resp, err = http.Get(url)
 	if err != nil {
 		return fmt.Errorf("Error reaching http server: %v", err)
@@ -95,13 +95,13 @@ func runHTTPRequest(port int) error {
 	return nil
 }
 
-func testHTTPServer(t *testing.T, port int) {
+func testHTTPServer(t *testing.T, ip string, port int) {
 	const requests = 10
 	ch := make(chan error, requests)
 	for i := 0; i < requests; i++ {
 		go func() {
 			start := time.Now()
-			err := runHTTPRequest(port)
+			err := runHTTPRequest(ip, port)
 			log.Printf("Response time %v: %v", time.Since(start).String(), err)
 			ch <- err
 		}()
@@ -110,7 +110,7 @@ func testHTTPServer(t *testing.T, port int) {
 	for i := 0; i < requests; i++ {
 		err := <-ch
 		if err != nil {
-			t.Errorf("testHTTPServer(%d) failed: %v", port, err)
+			t.Errorf("testHTTPServer(%s, %d) failed: %v", ip, port, err)
 		}
 	}
 }
@@ -121,27 +121,28 @@ func TestHttpd(t *testing.T) {
 	defer d.CleanUp(ctx)
 
 	// Start the container.
+	port := 80
 	opts := dockerutil.RunOpts{
 		Image: "basic/httpd",
-		Ports: []int{80},
+		Ports: []int{port},
 	}
 	d.CopyFiles(&opts, "/usr/local/apache2/htdocs", "test/image/latin10k.txt")
 	if err := d.Spawn(ctx, opts); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
-	// Find where port 80 is mapped to.
-	port, err := d.FindPort(ctx, 80)
+	// Find container IP address.
+	ip, err := d.FindIP(ctx, false)
 	if err != nil {
-		t.Fatalf("FindPort(80) failed: %v", err)
+		t.Fatalf("docker.FindIP failed: %v", err)
 	}
 
 	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+	if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
 		t.Errorf("WaitForHTTP() timeout: %v", err)
 	}
 
-	testHTTPServer(t, port)
+	testHTTPServer(t, ip.String(), port)
 }
 
 func TestNginx(t *testing.T) {
@@ -150,27 +151,28 @@ func TestNginx(t *testing.T) {
 	defer d.CleanUp(ctx)
 
 	// Start the container.
+	port := 80
 	opts := dockerutil.RunOpts{
 		Image: "basic/nginx",
-		Ports: []int{80},
+		Ports: []int{port},
 	}
 	d.CopyFiles(&opts, "/usr/share/nginx/html", "test/image/latin10k.txt")
 	if err := d.Spawn(ctx, opts); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
-	// Find where port 80 is mapped to.
-	port, err := d.FindPort(ctx, 80)
+	// Find container IP address.
+	ip, err := d.FindIP(ctx, false)
 	if err != nil {
-		t.Fatalf("FindPort(80) failed: %v", err)
+		t.Fatalf("docker.FindIP failed: %v", err)
 	}
 
 	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+	if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
 		t.Errorf("WaitForHTTP() timeout: %v", err)
 	}
 
-	testHTTPServer(t, port)
+	testHTTPServer(t, ip.String(), port)
 }
 
 func TestMysql(t *testing.T) {
@@ -218,26 +220,27 @@ func TestTomcat(t *testing.T) {
 	defer d.CleanUp(ctx)
 
 	// Start the server.
+	port := 8080
 	if err := d.Spawn(ctx, dockerutil.RunOpts{
 		Image: "basic/tomcat",
-		Ports: []int{8080},
+		Ports: []int{port},
 	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
-	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(ctx, 8080)
+	// Find container IP address.
+	ip, err := d.FindIP(ctx, false)
 	if err != nil {
-		t.Fatalf("FindPort(8080) failed: %v", err)
+		t.Fatalf("docker.FindIP failed: %v", err)
 	}
 
 	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+	if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
 		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 
 	// Ensure that content is being served.
-	url := fmt.Sprintf("http://localhost:%d", port)
+	url := fmt.Sprintf("http://%s:%d", ip.String(), port)
 	resp, err := http.Get(url)
 	if err != nil {
 		t.Errorf("Error reaching http server: %v", err)
@@ -253,28 +256,29 @@ func TestRuby(t *testing.T) {
 	defer d.CleanUp(ctx)
 
 	// Execute the ruby workload.
+	port := 8080
 	opts := dockerutil.RunOpts{
 		Image: "basic/ruby",
-		Ports: []int{8080},
+		Ports: []int{port},
 	}
 	d.CopyFiles(&opts, "/src", "test/image/ruby.rb", "test/image/ruby.sh")
 	if err := d.Spawn(ctx, opts, "/src/ruby.sh"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
 
-	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(ctx, 8080)
+	// Find container IP address.
+	ip, err := d.FindIP(ctx, false)
 	if err != nil {
-		t.Fatalf("FindPort(8080) failed: %v", err)
+		t.Fatalf("docker.FindIP failed: %v", err)
 	}
 
 	// Wait until it's up and running, 'gem install' can take some time.
-	if err := testutil.WaitForHTTP(port, time.Minute); err != nil {
+	if err := testutil.WaitForHTTP(ip.String(), port, time.Minute); err != nil {
 		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 
 	// Ensure that content is being served.
-	url := fmt.Sprintf("http://localhost:%d", port)
+	url := fmt.Sprintf("http://%s:%d", ip.String(), port)
 	resp, err := http.Get(url)
 	if err != nil {
 		t.Errorf("error reaching http server: %v", err)
diff --git a/test/iptables/README.md b/test/iptables/README.md
index b9f44bd40..28ab195ca 100644
--- a/test/iptables/README.md
+++ b/test/iptables/README.md
@@ -1,6 +1,6 @@
 # iptables Tests
 
-iptables tests are run via `scripts/iptables_test.sh`.
+iptables tests are run via `make iptables-tests`.
 
 iptables requires raw socket support, so you must add the `--net-raw=true` flag
 to `/etc/docker/daemon.json` in order to use it.
diff --git a/test/iptables/filter_output.go b/test/iptables/filter_output.go
index 32bf2a992..d3e5efd4f 100644
--- a/test/iptables/filter_output.go
+++ b/test/iptables/filter_output.go
@@ -441,9 +441,20 @@ func (FilterOutputDestination) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterOutputDestination) ContainerAction(ctx context.Context, ip net.IP, ipv6 bool) error {
-	rules := [][]string{
-		{"-A", "OUTPUT", "-d", ip.String(), "-j", "ACCEPT"},
-		{"-P", "OUTPUT", "DROP"},
+	var rules [][]string
+	if ipv6 {
+		rules = [][]string{
+			{"-A", "OUTPUT", "-d", ip.String(), "-j", "ACCEPT"},
+			// Allow solicited node multicast addresses so we can send neighbor
+			// solicitations.
+			{"-A", "OUTPUT", "-d", "ff02::1:ff00:0/104", "-j", "ACCEPT"},
+			{"-P", "OUTPUT", "DROP"},
+		}
+	} else {
+		rules = [][]string{
+			{"-A", "OUTPUT", "-d", ip.String(), "-j", "ACCEPT"},
+			{"-P", "OUTPUT", "DROP"},
+		}
 	}
 	if err := filterTableRules(ipv6, rules); err != nil {
 		return err
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index e2beb30d5..834f7615f 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -72,11 +72,6 @@ func iptablesTest(t *testing.T, test TestCase, ipv6 bool) {
 		d.CleanUp(context.Background())
 	}()
 
-	// TODO(gvisor.dev/issue/170): Skipping IPv6 gVisor tests.
-	if ipv6 && dockerutil.Runtime() != "runc" {
-		t.Skip("gVisor ip6tables not yet implemented")
-	}
-
 	// Create and start the container.
 	opts := dockerutil.RunOpts{
 		Image:  "iptables",
@@ -314,11 +309,11 @@ func TestInputInvertDestination(t *testing.T) {
 	singleTest(t, FilterInputInvertDestination{})
 }
 
-func TestOutputDestination(t *testing.T) {
+func TestFilterOutputDestination(t *testing.T) {
 	singleTest(t, FilterOutputDestination{})
 }
 
-func TestOutputInvertDestination(t *testing.T) {
+func TestFilterOutputInvertDestination(t *testing.T) {
 	singleTest(t, FilterOutputInvertDestination{})
 }
 
diff --git a/test/iptables/nat.go b/test/iptables/nat.go
index dd9a18339..b98d99fb8 100644
--- a/test/iptables/nat.go
+++ b/test/iptables/nat.go
@@ -577,11 +577,18 @@ func listenForRedirectedConn(ctx context.Context, ipv6 bool, originalDsts []net.
 	connCh := make(chan int)
 	errCh := make(chan error)
 	go func() {
-		connFD, _, err := syscall.Accept(sockfd)
-		if err != nil {
-			errCh <- err
+		for {
+			connFD, _, err := syscall.Accept(sockfd)
+			if errors.Is(err, syscall.EINTR) {
+				continue
+			}
+			if err != nil {
+				errCh <- err
+				return
+			}
+			connCh <- connFD
+			return
 		}
-		connCh <- connFD
 	}()
 
 	// Wait for accept() to return or for the context to finish.
diff --git a/test/kubernetes/gvisor-injection-admission-webhook.yaml b/test/kubernetes/gvisor-injection-admission-webhook.yaml
new file mode 100644
index 000000000..691f02dda
--- /dev/null
+++ b/test/kubernetes/gvisor-injection-admission-webhook.yaml
@@ -0,0 +1,89 @@
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: e2e
+  labels:
+    name: e2e
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: gvisor-injection-admission-webhook
+  namespace: e2e
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: gvisor-injection-admission-webhook
+rules:
+- apiGroups: [ admissionregistration.k8s.io ]
+  resources: [ mutatingwebhookconfigurations ]
+  verbs: [ create ]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: gvisor-injection-admission-webhook
+  namespace: e2e
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: gvisor-injection-admission-webhook
+subjects:
+- kind: ServiceAccount
+  name: gvisor-injection-admission-webhook
+  namespace: e2e
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gvisor-injection-admission-webhook
+  namespace: e2e
+  labels:
+    app: gvisor-injection-admission-webhook
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: gvisor-injection-admission-webhook
+  template:
+    metadata:
+      labels:
+        app: gvisor-injection-admission-webhook
+    spec:
+      containers:
+      - name: webhook
+        image: gcr.io/gke-gvisor/gvisor-injection-admission-webhook:54ce9bd
+        args:
+        - --log-level=debug
+        ports:
+        - containerPort: 8443
+      serviceAccountName: gvisor-injection-admission-webhook
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: gvisor-injection-admission-webhook
+  namespace: e2e
+spec:
+  selector:
+    app: gvisor-injection-admission-webhook
+  ports:
+  - protocol: TCP
+    port: 443
+    targetPort: 8443
diff --git a/test/packetimpact/README.md b/test/packetimpact/README.md
index ffa96ba98..fe0976ba5 100644
--- a/test/packetimpact/README.md
+++ b/test/packetimpact/README.md
@@ -694,6 +694,13 @@ func TestMyTcpTest(t *testing.T) {
 }
 ```
 
+### Adding a new packetimpact test
+
+*   Create a go test in the [tests directory](tests/)
+*   Add a `packetimpact_testbench` rule in [BUILD](tests/BUILD)
+*   Add the test into the `ALL_TESTS` list in [defs.bzl](runner/defs.bzl),
+    otherwise you will see an error message complaining about a missing test.
+
 ## Other notes
 
 *   The time between receiving a SYN-ACK and replying with an ACK in `Handshake`
diff --git a/test/packetimpact/dut/BUILD b/test/packetimpact/dut/BUILD
index 3ce63c2c6..ccf1c735f 100644
--- a/test/packetimpact/dut/BUILD
+++ b/test/packetimpact/dut/BUILD
@@ -16,3 +16,13 @@ cc_binary(
         "//test/packetimpact/proto:posix_server_cc_proto",
     ],
 )
+
+cc_binary(
+    name = "posix_server_dynamic",
+    srcs = ["posix_server.cc"],
+    deps = [
+        grpcpp,
+        "//test/packetimpact/proto:posix_server_cc_grpc_proto",
+        "//test/packetimpact/proto:posix_server_cc_proto",
+    ],
+)
diff --git a/test/packetimpact/dut/posix_server.cc b/test/packetimpact/dut/posix_server.cc
index 29d4cc6fe..4de8540f6 100644
--- a/test/packetimpact/dut/posix_server.cc
+++ b/test/packetimpact/dut/posix_server.cc
@@ -21,6 +21,7 @@
 #include <string.h>
 #include <sys/socket.h>
 #include <sys/types.h>
+#include <time.h>
 #include <unistd.h>
 
 #include <iostream>
@@ -28,6 +29,7 @@
 
 #include "include/grpcpp/security/server_credentials.h"
 #include "include/grpcpp/server_builder.h"
+#include "include/grpcpp/server_context.h"
 #include "test/packetimpact/proto/posix_server.grpc.pb.h"
 #include "test/packetimpact/proto/posix_server.pb.h"
 
@@ -108,18 +110,20 @@
 }
 
 class PosixImpl final : public posix_server::Posix::Service {
-  ::grpc::Status Accept(grpc_impl::ServerContext *context,
+  ::grpc::Status Accept(grpc::ServerContext *context,
                         const ::posix_server::AcceptRequest *request,
                         ::posix_server::AcceptResponse *response) override {
     sockaddr_storage addr;
     socklen_t addrlen = sizeof(addr);
     response->set_fd(accept(request->sockfd(),
                             reinterpret_cast<sockaddr *>(&addr), &addrlen));
-    response->set_errno_(errno);
+    if (response->fd() < 0) {
+      response->set_errno_(errno);
+    }
     return sockaddr_to_proto(addr, addrlen, response->mutable_addr());
   }
 
-  ::grpc::Status Bind(grpc_impl::ServerContext *context,
+  ::grpc::Status Bind(grpc::ServerContext *context,
                       const ::posix_server::BindRequest *request,
                       ::posix_server::BindResponse *response) override {
     if (!request->has_addr()) {
@@ -136,19 +140,23 @@ class PosixImpl final : public posix_server::Posix::Service {
 
     response->set_ret(
         bind(request->sockfd(), reinterpret_cast<sockaddr *>(&addr), addr_len));
-    response->set_errno_(errno);
+    if (response->ret() < 0) {
+      response->set_errno_(errno);
+    }
     return ::grpc::Status::OK;
   }
 
-  ::grpc::Status Close(grpc_impl::ServerContext *context,
+  ::grpc::Status Close(grpc::ServerContext *context,
                        const ::posix_server::CloseRequest *request,
                        ::posix_server::CloseResponse *response) override {
     response->set_ret(close(request->fd()));
-    response->set_errno_(errno);
+    if (response->ret() < 0) {
+      response->set_errno_(errno);
+    }
     return ::grpc::Status::OK;
   }
 
-  ::grpc::Status Connect(grpc_impl::ServerContext *context,
+  ::grpc::Status Connect(grpc::ServerContext *context,
                          const ::posix_server::ConnectRequest *request,
                          ::posix_server::ConnectResponse *response) override {
     if (!request->has_addr()) {
@@ -164,32 +172,38 @@ class PosixImpl final : public posix_server::Posix::Service {
 
     response->set_ret(connect(request->sockfd(),
                               reinterpret_cast<sockaddr *>(&addr), addr_len));
-    response->set_errno_(errno);
+    if (response->ret() < 0) {
+      response->set_errno_(errno);
+    }
     return ::grpc::Status::OK;
   }
 
-  ::grpc::Status Fcntl(grpc_impl::ServerContext *context,
+  ::grpc::Status Fcntl(grpc::ServerContext *context,
                        const ::posix_server::FcntlRequest *request,
                        ::posix_server::FcntlResponse *response) override {
     response->set_ret(::fcntl(request->fd(), request->cmd(), request->arg()));
-    response->set_errno_(errno);
+    if (response->ret() < 0) {
+      response->set_errno_(errno);
+    }
     return ::grpc::Status::OK;
   }
 
   ::grpc::Status GetSockName(
-      grpc_impl::ServerContext *context,
+      grpc::ServerContext *context,
       const ::posix_server::GetSockNameRequest *request,
       ::posix_server::GetSockNameResponse *response) override {
     sockaddr_storage addr;
     socklen_t addrlen = sizeof(addr);
     response->set_ret(getsockname(
         request->sockfd(), reinterpret_cast<sockaddr *>(&addr), &addrlen));
-    response->set_errno_(errno);
+    if (response->ret() < 0) {
+      response->set_errno_(errno);
+    }
     return sockaddr_to_proto(addr, addrlen, response->mutable_addr());
   }
 
   ::grpc::Status GetSockOpt(
-      grpc_impl::ServerContext *context,
+      grpc::ServerContext *context,
       const ::posix_server::GetSockOptRequest *request,
       ::posix_server::GetSockOptResponse *response) override {
     switch (request->type()) {
@@ -226,15 +240,19 @@ class PosixImpl final : public posix_server::Posix::Service {
         return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
                               "Unknown SockOpt Type");
     }
-    response->set_errno_(errno);
+    if (response->ret() < 0) {
+      response->set_errno_(errno);
+    }
     return ::grpc::Status::OK;
   }
 
-  ::grpc::Status Listen(grpc_impl::ServerContext *context,
+  ::grpc::Status Listen(grpc::ServerContext *context,
                         const ::posix_server::ListenRequest *request,
                         ::posix_server::ListenResponse *response) override {
     response->set_ret(listen(request->sockfd(), request->backlog()));
-    response->set_errno_(errno);
+    if (response->ret() < 0) {
+      response->set_errno_(errno);
+    }
     return ::grpc::Status::OK;
   }
 
@@ -243,7 +261,9 @@ class PosixImpl final : public posix_server::Posix::Service {
                       ::posix_server::SendResponse *response) override {
     response->set_ret(::send(request->sockfd(), request->buf().data(),
                              request->buf().size(), request->flags()));
-    response->set_errno_(errno);
+    if (response->ret() < 0) {
+      response->set_errno_(errno);
+    }
     return ::grpc::Status::OK;
   }
 
@@ -264,12 +284,14 @@ class PosixImpl final : public posix_server::Posix::Service {
     response->set_ret(::sendto(request->sockfd(), request->buf().data(),
                                request->buf().size(), request->flags(),
                                reinterpret_cast<sockaddr *>(&addr), addr_len));
-    response->set_errno_(errno);
+    if (response->ret() < 0) {
+      response->set_errno_(errno);
+    }
     return ::grpc::Status::OK;
   }
 
   ::grpc::Status SetSockOpt(
-      grpc_impl::ServerContext *context,
+      grpc::ServerContext *context,
       const ::posix_server::SetSockOptRequest *request,
       ::posix_server::SetSockOptResponse *response) override {
     switch (request->optval().val_case()) {
@@ -286,9 +308,9 @@ class PosixImpl final : public posix_server::Posix::Service {
         break;
       }
       case ::posix_server::SockOptVal::kTimeval: {
-        timeval tv = {.tv_sec = static_cast<__time_t>(
+        timeval tv = {.tv_sec = static_cast<time_t>(
                           request->optval().timeval().seconds()),
-                      .tv_usec = static_cast<__suseconds_t>(
+                      .tv_usec = static_cast<suseconds_t>(
                           request->optval().timeval().microseconds())};
         response->set_ret(setsockopt(request->sockfd(), request->level(),
                                      request->optname(), &tv, sizeof(tv)));
@@ -298,16 +320,29 @@ class PosixImpl final : public posix_server::Posix::Service {
         return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
                               "Unknown SockOpt Type");
     }
-    response->set_errno_(errno);
+    if (response->ret() < 0) {
+      response->set_errno_(errno);
+    }
     return ::grpc::Status::OK;
   }
 
-  ::grpc::Status Socket(grpc_impl::ServerContext *context,
+  ::grpc::Status Socket(grpc::ServerContext *context,
                         const ::posix_server::SocketRequest *request,
                         ::posix_server::SocketResponse *response) override {
     response->set_fd(
         socket(request->domain(), request->type(), request->protocol()));
-    response->set_errno_(errno);
+    if (response->fd() < 0) {
+      response->set_errno_(errno);
+    }
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Shutdown(grpc::ServerContext *context,
+                          const ::posix_server::ShutdownRequest *request,
+                          ::posix_server::ShutdownResponse *response) override {
+    if (shutdown(request->fd(), request->how()) < 0) {
+      response->set_errno_(errno);
+    }
     return ::grpc::Status::OK;
   }
 
@@ -320,7 +355,9 @@ class PosixImpl final : public posix_server::Posix::Service {
     if (response->ret() >= 0) {
       response->set_buf(buf.data(), response->ret());
     }
-    response->set_errno_(errno);
+    if (response->ret() < 0) {
+      response->set_errno_(errno);
+    }
     return ::grpc::Status::OK;
   }
 };
diff --git a/test/packetimpact/proto/posix_server.proto b/test/packetimpact/proto/posix_server.proto
index ccd20b10d..f32ed54ef 100644
--- a/test/packetimpact/proto/posix_server.proto
+++ b/test/packetimpact/proto/posix_server.proto
@@ -188,6 +188,15 @@ message SocketResponse {
   int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
+message ShutdownRequest {
+  int32 fd = 1;
+  int32 how = 2;
+}
+
+message ShutdownResponse {
+  int32 errno_ = 1;  // "errno" may fail to compile in c++.
+}
+
 message RecvRequest {
   int32 sockfd = 1;
   int32 len = 2;
@@ -225,6 +234,8 @@ service Posix {
   rpc SetSockOpt(SetSockOptRequest) returns (SetSockOptResponse);
   // Call socket() on the DUT.
   rpc Socket(SocketRequest) returns (SocketResponse);
+  // Call shutdown() on the DUT.
+  rpc Shutdown(ShutdownRequest) returns (ShutdownResponse);
   // Call recv() on the DUT.
   rpc Recv(RecvRequest) returns (RecvResponse);
 }
diff --git a/test/packetimpact/runner/BUILD b/test/packetimpact/runner/BUILD
index ff2be9b30..605dd4972 100644
--- a/test/packetimpact/runner/BUILD
+++ b/test/packetimpact/runner/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "bzl_library", "go_test")
+load("//tools:defs.bzl", "bzl_library", "go_library", "go_test")
 
 package(
     default_visibility = ["//test/packetimpact:__subpackages__"],
@@ -7,21 +7,31 @@ package(
 
 go_test(
     name = "packetimpact_test",
-    srcs = ["packetimpact_test.go"],
+    srcs = [
+        "packetimpact_test.go",
+    ],
     tags = [
         # Not intended to be run directly.
         "local",
         "manual",
     ],
-    deps = [
-        "//pkg/test/dockerutil",
-        "//test/packetimpact/netdevs",
-        "@com_github_docker_docker//api/types/mount:go_default_library",
-    ],
+    deps = [":runner"],
 )
 
 bzl_library(
     name = "defs_bzl",
     srcs = ["defs.bzl"],
-    visibility = ["//visibility:private"],
+    visibility = ["//test/packetimpact:__subpackages__"],
+)
+
+go_library(
+    name = "runner",
+    testonly = True,
+    srcs = ["dut.go"],
+    visibility = ["//test/packetimpact:__subpackages__"],
+    deps = [
+        "//pkg/test/dockerutil",
+        "//test/packetimpact/netdevs",
+        "@com_github_docker_docker//api/types/mount:go_default_library",
+    ],
 )
diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl
index 93a36c6c2..1546d0d51 100644
--- a/test/packetimpact/runner/defs.bzl
+++ b/test/packetimpact/runner/defs.bzl
@@ -23,8 +23,9 @@ def _packetimpact_test_impl(ctx):
     transitive_files = []
     if hasattr(ctx.attr._test_runner, "data_runfiles"):
         transitive_files.append(ctx.attr._test_runner.data_runfiles.files)
+    files = [test_runner] + ctx.files.testbench_binary + ctx.files._posix_server
     runfiles = ctx.runfiles(
-        files = [test_runner] + ctx.files.testbench_binary + ctx.files._posix_server_binary,
+        files = files,
         transitive_files = depset(transitive = transitive_files),
         collect_default = True,
         collect_data = True,
@@ -38,7 +39,7 @@ _packetimpact_test = rule(
             cfg = "target",
             default = ":packetimpact_test",
         ),
-        "_posix_server_binary": attr.label(
+        "_posix_server": attr.label(
             cfg = "target",
             default = "//test/packetimpact/dut:posix_server",
         ),
@@ -109,28 +110,15 @@ def packetimpact_netstack_test(
         **kwargs
     )
 
-def packetimpact_go_test(name, size = "small", pure = True, expect_native_failure = False, expect_netstack_failure = False, **kwargs):
+def packetimpact_go_test(name, expect_native_failure = False, expect_netstack_failure = False):
     """Add packetimpact tests written in go.
 
     Args:
         name: name of the test
-        size: size of the test
-        pure: make a static go binary
         expect_native_failure: the test must fail natively
         expect_netstack_failure: the test must fail for Netstack
-        **kwargs: all the other args, forwarded to go_test
     """
     testbench_binary = name + "_test"
-    go_test(
-        name = testbench_binary,
-        size = size,
-        pure = pure,
-        tags = [
-            "local",
-            "manual",
-        ],
-        **kwargs
-    )
     packetimpact_native_test(
         name = name,
         expect_failure = expect_native_failure,
@@ -141,3 +129,156 @@ def packetimpact_go_test(name, size = "small", pure = True, expect_native_failur
         expect_failure = expect_netstack_failure,
         testbench_binary = testbench_binary,
     )
+
+def packetimpact_testbench(name, size = "small", pure = True, **kwargs):
+    """Build packetimpact testbench written in go.
+
+    Args:
+        name: name of the test
+        size: size of the test
+        pure: make a static go binary
+        **kwargs: all the other args, forwarded to go_test
+    """
+    go_test(
+        name = name + "_test",
+        size = size,
+        pure = pure,
+        nogo = False,  # FIXME(gvisor.dev/issue/3374): Not working with all build systems.
+        tags = [
+            "local",
+            "manual",
+        ],
+        **kwargs
+    )
+
+PacketimpactTestInfo = provider(
+    doc = "Provide information for packetimpact tests",
+    fields = ["name", "expect_netstack_failure"],
+)
+
+ALL_TESTS = [
+    PacketimpactTestInfo(
+        name = "fin_wait2_timeout",
+    ),
+    PacketimpactTestInfo(
+        name = "ipv4_id_uniqueness",
+    ),
+    PacketimpactTestInfo(
+        name = "udp_discard_mcast_source_addr",
+    ),
+    PacketimpactTestInfo(
+        name = "udp_recv_mcast_bcast",
+    ),
+    PacketimpactTestInfo(
+        name = "udp_any_addr_recv_unicast",
+    ),
+    PacketimpactTestInfo(
+        name = "udp_icmp_error_propagation",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_reordering",
+        # TODO(b/139368047): Fix netstack then remove the line below.
+        expect_netstack_failure = True,
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_window_shrink",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_zero_window_probe",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_zero_window_probe_retransmit",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_zero_window_probe_usertimeout",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_retransmits",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_outside_the_window",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_noaccept_close_rst",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_send_window_sizes_piggyback",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_unacc_seq_ack",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_paws_mechanism",
+        # TODO(b/156682000): Fix netstack then remove the line below.
+        expect_netstack_failure = True,
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_user_timeout",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_queue_receive_in_syn_sent",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_synsent_reset",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_synrcvd_reset",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_network_unreachable",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_cork_mss",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_handshake_window_size",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_timewait_reset",
+        # TODO(b/168523247): Fix netstack then remove the line below.
+        expect_netstack_failure = True,
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_queue_send_in_syn_sent",
+    ),
+    PacketimpactTestInfo(
+        name = "icmpv6_param_problem",
+        # TODO(b/153485026): Fix netstack then remove the line below.
+        expect_netstack_failure = True,
+    ),
+    PacketimpactTestInfo(
+        name = "ipv6_unknown_options_action",
+        # TODO(b/159928940): Fix netstack then remove the line below.
+        expect_netstack_failure = True,
+    ),
+    PacketimpactTestInfo(
+        name = "ipv6_fragment_reassembly",
+    ),
+    PacketimpactTestInfo(
+        name = "udp_send_recv_dgram",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_linger",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_rcv_buf_space",
+    ),
+]
+
+def validate_all_tests():
+    """
+    Make sure that ALL_TESTS list is in sync with the rules in BUILD.
+
+    This function is order-dependent, it is intended to be used after
+    all packetimpact_testbench rules and before using ALL_TESTS list
+    at the end of BUILD.
+    """
+    all_tests_dict = {}  # there is no set, using dict to approximate.
+    for test in ALL_TESTS:
+        rule_name = test.name + "_test"
+        all_tests_dict[rule_name] = True
+        if not native.existing_rule(rule_name):
+            fail("%s does not have a packetimpact_testbench rule in BUILD" % test.name)
+    for name in native.existing_rules():
+        if name.endswith("_test") and name not in all_tests_dict:
+            fail("%s is not declared in ALL_TESTS list in defs.bzl" % name[:-5])
diff --git a/test/packetimpact/runner/dut.go b/test/packetimpact/runner/dut.go
new file mode 100644
index 000000000..59bb68eb1
--- /dev/null
+++ b/test/packetimpact/runner/dut.go
@@ -0,0 +1,442 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package runner starts docker containers and networking for a packetimpact test.
+package runner
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"math/rand"
+	"net"
+	"os"
+	"os/exec"
+	"path"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/docker/docker/api/types/mount"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/test/packetimpact/netdevs"
+)
+
+// stringList implements flag.Value.
+type stringList []string
+
+// String implements flag.Value.String.
+func (l *stringList) String() string {
+	return strings.Join(*l, ",")
+}
+
+// Set implements flag.Value.Set.
+func (l *stringList) Set(value string) error {
+	*l = append(*l, value)
+	return nil
+}
+
+var (
+	native          = false
+	testbenchBinary = ""
+	tshark          = false
+	extraTestArgs   = stringList{}
+	expectFailure   = false
+
+	// DutAddr is the IP addres for DUT.
+	DutAddr       = net.IPv4(0, 0, 0, 10)
+	testbenchAddr = net.IPv4(0, 0, 0, 20)
+)
+
+// RegisterFlags defines flags and associates them with the package-level
+// exported variables above. It should be called by tests in their init
+// functions.
+func RegisterFlags(fs *flag.FlagSet) {
+	fs.BoolVar(&native, "native", false, "whether the test should be run natively")
+	fs.StringVar(&testbenchBinary, "testbench_binary", "", "path to the testbench binary")
+	fs.BoolVar(&tshark, "tshark", false, "use more verbose tshark in logs instead of tcpdump")
+	fs.Var(&extraTestArgs, "extra_test_arg", "extra arguments to pass to the testbench")
+	fs.BoolVar(&expectFailure, "expect_failure", false, "expect that the test will fail when run")
+}
+
+// CtrlPort is the port that posix_server listens on.
+const CtrlPort = "40000"
+
+// logger implements testutil.Logger.
+//
+// Labels logs based on their source and formats multi-line logs.
+type logger string
+
+// Name implements testutil.Logger.Name.
+func (l logger) Name() string {
+	return string(l)
+}
+
+// Logf implements testutil.Logger.Logf.
+func (l logger) Logf(format string, args ...interface{}) {
+	lines := strings.Split(fmt.Sprintf(format, args...), "\n")
+	log.Printf("%s: %s", l, lines[0])
+	for _, line := range lines[1:] {
+		log.Printf("%*s  %s", len(l), "", line)
+	}
+}
+
+// TestWithDUT runs a packetimpact test with the given information.
+func TestWithDUT(ctx context.Context, t *testing.T, mkDevice func(*dockerutil.Container) DUT, containerAddr net.IP) {
+	if testbenchBinary == "" {
+		t.Fatal("--testbench_binary is missing")
+	}
+	dockerutil.EnsureSupportedDockerVersion()
+
+	// Create the networks needed for the test. One control network is needed for
+	// the gRPC control packets and one test network on which to transmit the test
+	// packets.
+	ctrlNet := dockerutil.NewNetwork(ctx, logger("ctrlNet"))
+	testNet := dockerutil.NewNetwork(ctx, logger("testNet"))
+	for _, dn := range []*dockerutil.Network{ctrlNet, testNet} {
+		for {
+			if err := createDockerNetwork(ctx, dn); err != nil {
+				t.Log("creating docker network:", err)
+				const wait = 100 * time.Millisecond
+				t.Logf("sleeping %s and will try creating docker network again", wait)
+				// This can fail if another docker network claimed the same IP so we'll
+				// just try again.
+				time.Sleep(wait)
+				continue
+			}
+			break
+		}
+		dn := dn
+		t.Cleanup(func() {
+			if err := dn.Cleanup(ctx); err != nil {
+				t.Errorf("unable to cleanup container %s: %s", dn.Name, err)
+			}
+		})
+		// Sanity check.
+		if inspect, err := dn.Inspect(ctx); err != nil {
+			t.Fatalf("failed to inspect network %s: %v", dn.Name, err)
+		} else if inspect.Name != dn.Name {
+			t.Fatalf("name mismatch for network want: %s got: %s", dn.Name, inspect.Name)
+		}
+	}
+
+	tmpDir, err := ioutil.TempDir("", "container-output")
+	if err != nil {
+		t.Fatal("creating temp dir:", err)
+	}
+	t.Cleanup(func() {
+		if err := exec.Command("/bin/cp", "-r", tmpDir, os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR")).Run(); err != nil {
+			t.Errorf("unable to copy container output files: %s", err)
+		}
+		if err := os.RemoveAll(tmpDir); err != nil {
+			t.Errorf("failed to remove tmpDir %s: %s", tmpDir, err)
+		}
+	})
+
+	const testOutputDir = "/tmp/testoutput"
+
+	// Create the Docker container for the DUT.
+	var dut *dockerutil.Container
+	if native {
+		dut = dockerutil.MakeNativeContainer(ctx, logger("dut"))
+	} else {
+		dut = dockerutil.MakeContainer(ctx, logger("dut"))
+	}
+	t.Cleanup(func() {
+		dut.CleanUp(ctx)
+	})
+
+	runOpts := dockerutil.RunOpts{
+		Image:  "packetimpact",
+		CapAdd: []string{"NET_ADMIN"},
+		Mounts: []mount.Mount{{
+			Type:     mount.TypeBind,
+			Source:   tmpDir,
+			Target:   testOutputDir,
+			ReadOnly: false,
+		}},
+	}
+
+	device := mkDevice(dut)
+	remoteIPv6, remoteMAC, dutDeviceID, dutTestNetDev := device.Prepare(ctx, t, runOpts, ctrlNet, testNet, containerAddr)
+
+	// Create the Docker container for the testbench.
+	testbench := dockerutil.MakeNativeContainer(ctx, logger("testbench"))
+
+	tbb := path.Base(testbenchBinary)
+	containerTestbenchBinary := filepath.Join("/packetimpact", tbb)
+	testbench.CopyFiles(&runOpts, "/packetimpact", filepath.Join("test/packetimpact/tests", tbb))
+
+	// snifferNetDev is a network device on the test orchestrator that we will
+	// run sniffer (tcpdump or tshark) on and inject traffic to, not to be
+	// confused with the device on the DUT.
+	const snifferNetDev = "eth2"
+	// Run tcpdump in the test bench unbuffered, without DNS resolution, just on
+	// the interface with the test packets.
+	snifferArgs := []string{
+		"tcpdump",
+		"-S", "-vvv", "-U", "-n",
+		"-i", snifferNetDev,
+		"-w", testOutputDir + "/dump.pcap",
+	}
+	snifferRegex := "tcpdump: listening.*\n"
+	if tshark {
+		// Run tshark in the test bench unbuffered, without DNS resolution, just on
+		// the interface with the test packets.
+		snifferArgs = []string{
+			"tshark", "-V", "-l", "-n", "-i", snifferNetDev,
+			"-o", "tcp.check_checksum:TRUE",
+			"-o", "udp.check_checksum:TRUE",
+		}
+		snifferRegex = "Capturing on.*\n"
+	}
+
+	if err := StartContainer(
+		ctx,
+		runOpts,
+		testbench,
+		testbenchAddr,
+		[]*dockerutil.Network{ctrlNet, testNet},
+		snifferArgs...,
+	); err != nil {
+		t.Fatalf("failed to start docker container for testbench sniffer: %s", err)
+	}
+	// Kill so that it will flush output.
+	t.Cleanup(func() {
+		time.Sleep(1 * time.Second)
+		testbench.Exec(ctx, dockerutil.ExecOpts{}, "killall", snifferArgs[0])
+	})
+
+	if _, err := testbench.WaitForOutput(ctx, snifferRegex, 60*time.Second); err != nil {
+		t.Fatalf("sniffer on %s never listened: %s", dut.Name, err)
+	}
+
+	// When the Linux kernel receives a SYN-ACK for a SYN it didn't send, it
+	// will respond with an RST. In most packetimpact tests, the SYN is sent
+	// by the raw socket and the kernel knows nothing about the connection, this
+	// behavior will break lots of TCP related packetimpact tests. To prevent
+	// this, we can install the following iptables rules. The raw socket that
+	// packetimpact tests use will still be able to see everything.
+	for _, bin := range []string{"iptables", "ip6tables"} {
+		if logs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, bin, "-A", "INPUT", "-i", snifferNetDev, "-p", "tcp", "-j", "DROP"); err != nil {
+			t.Fatalf("unable to Exec %s on container %s: %s, logs from testbench:\n%s", bin, testbench.Name, err, logs)
+		}
+	}
+
+	// FIXME(b/156449515): Some piece of the system has a race. The old
+	// bash script version had a sleep, so we have one too. The race should
+	// be fixed and this sleep removed.
+	time.Sleep(time.Second)
+
+	// Start a packetimpact test on the test bench. The packetimpact test sends
+	// and receives packets and also sends POSIX socket commands to the
+	// posix_server to be executed on the DUT.
+	testArgs := []string{containerTestbenchBinary}
+	testArgs = append(testArgs, extraTestArgs...)
+	testArgs = append(testArgs,
+		"--posix_server_ip", AddressInSubnet(DutAddr, *ctrlNet.Subnet).String(),
+		"--posix_server_port", CtrlPort,
+		"--remote_ipv4", AddressInSubnet(DutAddr, *testNet.Subnet).String(),
+		"--local_ipv4", AddressInSubnet(testbenchAddr, *testNet.Subnet).String(),
+		"--remote_ipv6", remoteIPv6.String(),
+		"--remote_mac", remoteMAC.String(),
+		"--remote_interface_id", fmt.Sprintf("%d", dutDeviceID),
+		"--local_device", snifferNetDev,
+		"--remote_device", dutTestNetDev,
+		fmt.Sprintf("--native=%t", native),
+	)
+	testbenchLogs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, testArgs...)
+	if (err != nil) != expectFailure {
+		var dutLogs string
+		if logs, err := device.Logs(ctx); err != nil {
+			dutLogs = fmt.Sprintf("failed to fetch DUT logs: %s", err)
+		} else {
+			dutLogs = logs
+		}
+
+		t.Errorf(`test error: %v, expect failure: %t
+
+%s
+
+====== Begin of Testbench Logs ======
+
+%s
+
+====== End of Testbench Logs ======`,
+			err, expectFailure, dutLogs, testbenchLogs)
+	}
+}
+
+// DUT describes how to setup/teardown the dut for packetimpact tests.
+type DUT interface {
+	// Prepare prepares the dut, starts posix_server and returns the IPv6, MAC
+	// address, the interface ID, and the interface name for the testNet on DUT.
+	Prepare(ctx context.Context, t *testing.T, runOpts dockerutil.RunOpts, ctrlNet, testNet *dockerutil.Network, containerAddr net.IP) (net.IP, net.HardwareAddr, uint32, string)
+	// Logs retrieves the logs from the dut.
+	Logs(ctx context.Context) (string, error)
+}
+
+// DockerDUT describes a docker based DUT.
+type DockerDUT struct {
+	c *dockerutil.Container
+}
+
+// NewDockerDUT creates a docker based DUT.
+func NewDockerDUT(c *dockerutil.Container) DUT {
+	return &DockerDUT{
+		c: c,
+	}
+}
+
+// Prepare implements DUT.Prepare.
+func (dut *DockerDUT) Prepare(ctx context.Context, t *testing.T, runOpts dockerutil.RunOpts, ctrlNet, testNet *dockerutil.Network, containerAddr net.IP) (net.IP, net.HardwareAddr, uint32, string) {
+	const containerPosixServerBinary = "/packetimpact/posix_server"
+	dut.c.CopyFiles(&runOpts, "/packetimpact", "test/packetimpact/dut/posix_server")
+
+	if err := StartContainer(
+		ctx,
+		runOpts,
+		dut.c,
+		containerAddr,
+		[]*dockerutil.Network{ctrlNet, testNet},
+		containerPosixServerBinary,
+		"--ip=0.0.0.0",
+		"--port="+CtrlPort,
+	); err != nil {
+		t.Fatalf("failed to start docker container for DUT: %s", err)
+	}
+
+	if _, err := dut.c.WaitForOutput(ctx, "Server listening.*\n", 60*time.Second); err != nil {
+		t.Fatalf("%s on container %s never listened: %s", containerPosixServerBinary, dut.c.Name, err)
+	}
+
+	dutTestDevice, dutDeviceInfo, err := deviceByIP(ctx, dut.c, AddressInSubnet(containerAddr, *testNet.Subnet))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	remoteMAC := dutDeviceInfo.MAC
+	remoteIPv6 := dutDeviceInfo.IPv6Addr
+	// Netstack as DUT doesn't assign IPv6 addresses automatically so do it if
+	// needed.
+	if remoteIPv6 == nil {
+		if _, err := dut.c.Exec(ctx, dockerutil.ExecOpts{}, "ip", "addr", "add", netdevs.MACToIP(remoteMAC).String(), "scope", "link", "dev", dutTestDevice); err != nil {
+			t.Fatalf("unable to ip addr add on container %s: %s", dut.c.Name, err)
+		}
+		// Now try again, to make sure that it worked.
+		_, dutDeviceInfo, err = deviceByIP(ctx, dut.c, AddressInSubnet(containerAddr, *testNet.Subnet))
+		if err != nil {
+			t.Fatal(err)
+		}
+		remoteIPv6 = dutDeviceInfo.IPv6Addr
+		if remoteIPv6 == nil {
+			t.Fatalf("unable to set IPv6 address on container %s", dut.c.Name)
+		}
+	}
+	const testNetDev = "eth2"
+
+	return remoteIPv6, dutDeviceInfo.MAC, dutDeviceInfo.ID, testNetDev
+}
+
+// Logs implements DUT.Logs.
+func (dut *DockerDUT) Logs(ctx context.Context) (string, error) {
+	logs, err := dut.c.Logs(ctx)
+	if err != nil {
+		return "", err
+	}
+	return fmt.Sprintf(`====== Begin of DUT Logs ======
+
+%s
+
+====== End of DUT Logs ======`, logs), nil
+}
+
+// AddNetworks connects docker network with the container and assigns the specific IP.
+func AddNetworks(ctx context.Context, d *dockerutil.Container, addr net.IP, networks []*dockerutil.Network) error {
+	for _, dn := range networks {
+		ip := AddressInSubnet(addr, *dn.Subnet)
+		// Connect to the network with the specified IP address.
+		if err := dn.Connect(ctx, d, ip.String(), ""); err != nil {
+			return fmt.Errorf("unable to connect container %s to network %s: %w", d.Name, dn.Name, err)
+		}
+	}
+	return nil
+}
+
+// AddressInSubnet combines the subnet provided with the address and returns a
+// new address. The return address bits come from the subnet where the mask is 1
+// and from the ip address where the mask is 0.
+func AddressInSubnet(addr net.IP, subnet net.IPNet) net.IP {
+	var octets []byte
+	for i := 0; i < 4; i++ {
+		octets = append(octets, (subnet.IP.To4()[i]&subnet.Mask[i])+(addr.To4()[i]&(^subnet.Mask[i])))
+	}
+	return net.IP(octets)
+}
+
+// deviceByIP finds a deviceInfo and device name from an IP address.
+func deviceByIP(ctx context.Context, d *dockerutil.Container, ip net.IP) (string, netdevs.DeviceInfo, error) {
+	out, err := d.Exec(ctx, dockerutil.ExecOpts{}, "ip", "addr", "show")
+	if err != nil {
+		return "", netdevs.DeviceInfo{}, fmt.Errorf("listing devices on %s container: %w\n%s", d.Name, err, out)
+	}
+	devs, err := netdevs.ParseDevices(out)
+	if err != nil {
+		return "", netdevs.DeviceInfo{}, fmt.Errorf("parsing devices from %s container: %w\n%s", d.Name, err, out)
+	}
+	testDevice, deviceInfo, err := netdevs.FindDeviceByIP(ip, devs)
+	if err != nil {
+		return "", netdevs.DeviceInfo{}, fmt.Errorf("can't find deviceInfo for container %s: %w", d.Name, err)
+	}
+	return testDevice, deviceInfo, nil
+}
+
+// createDockerNetwork makes a randomly-named network that will start with the
+// namePrefix. The network will be a random /24 subnet.
+func createDockerNetwork(ctx context.Context, n *dockerutil.Network) error {
+	randSource := rand.NewSource(time.Now().UnixNano())
+	r1 := rand.New(randSource)
+	// Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
+	ip := net.IPv4(byte(r1.Intn(224-192)+192), byte(r1.Intn(256)), byte(r1.Intn(256)), 0)
+	n.Subnet = &net.IPNet{
+		IP:   ip,
+		Mask: ip.DefaultMask(),
+	}
+	return n.Create(ctx)
+}
+
+// StartContainer will create a container instance from runOpts, connect it
+// with the specified docker networks and start executing the specified cmd.
+func StartContainer(ctx context.Context, runOpts dockerutil.RunOpts, c *dockerutil.Container, containerAddr net.IP, ns []*dockerutil.Network, cmd ...string) error {
+	conf, hostconf, netconf := c.ConfigsFrom(runOpts, cmd...)
+	_ = netconf
+	hostconf.AutoRemove = true
+	hostconf.Sysctls = map[string]string{"net.ipv6.conf.all.disable_ipv6": "0"}
+
+	if err := c.CreateFrom(ctx, conf, hostconf, nil); err != nil {
+		return fmt.Errorf("unable to create container %s: %w", c.Name, err)
+	}
+
+	if err := AddNetworks(ctx, c, containerAddr, ns); err != nil {
+		return fmt.Errorf("unable to connect the container with the networks: %w", err)
+	}
+
+	if err := c.Start(ctx); err != nil {
+		return fmt.Errorf("unable to start container %s: %w", c.Name, err)
+	}
+	return nil
+}
diff --git a/test/packetimpact/runner/packetimpact_test.go b/test/packetimpact/runner/packetimpact_test.go
index e8c183977..c598bfc29 100644
--- a/test/packetimpact/runner/packetimpact_test.go
+++ b/test/packetimpact/runner/packetimpact_test.go
@@ -18,366 +18,15 @@ package packetimpact_test
 import (
 	"context"
 	"flag"
-	"fmt"
-	"io/ioutil"
-	"log"
-	"math/rand"
-	"net"
-	"os"
-	"os/exec"
-	"path"
-	"strings"
 	"testing"
-	"time"
 
-	"github.com/docker/docker/api/types/mount"
-	"gvisor.dev/gvisor/pkg/test/dockerutil"
-	"gvisor.dev/gvisor/test/packetimpact/netdevs"
+	"gvisor.dev/gvisor/test/packetimpact/runner"
 )
 
-// stringList implements flag.Value.
-type stringList []string
-
-// String implements flag.Value.String.
-func (l *stringList) String() string {
-	return strings.Join(*l, ",")
-}
-
-// Set implements flag.Value.Set.
-func (l *stringList) Set(value string) error {
-	*l = append(*l, value)
-	return nil
-}
-
-var (
-	native          = flag.Bool("native", false, "whether the test should be run natively")
-	testbenchBinary = flag.String("testbench_binary", "", "path to the testbench binary")
-	tshark          = flag.Bool("tshark", false, "use more verbose tshark in logs instead of tcpdump")
-	extraTestArgs   = stringList{}
-	expectFailure   = flag.Bool("expect_failure", false, "expect that the test will fail when run")
-
-	dutAddr       = net.IPv4(0, 0, 0, 10)
-	testbenchAddr = net.IPv4(0, 0, 0, 20)
-)
-
-const ctrlPort = "40000"
-
-// logger implements testutil.Logger.
-//
-// Labels logs based on their source and formats multi-line logs.
-type logger string
-
-// Name implements testutil.Logger.Name.
-func (l logger) Name() string {
-	return string(l)
-}
-
-// Logf implements testutil.Logger.Logf.
-func (l logger) Logf(format string, args ...interface{}) {
-	lines := strings.Split(fmt.Sprintf(format, args...), "\n")
-	log.Printf("%s: %s", l, lines[0])
-	for _, line := range lines[1:] {
-		log.Printf("%*s  %s", len(l), "", line)
-	}
+func init() {
+	runner.RegisterFlags(flag.CommandLine)
 }
 
 func TestOne(t *testing.T) {
-	flag.Var(&extraTestArgs, "extra_test_arg", "extra arguments to pass to the testbench")
-	flag.Parse()
-	if *testbenchBinary == "" {
-		t.Fatal("--testbench_binary is missing")
-	}
-	dockerutil.EnsureSupportedDockerVersion()
-	ctx := context.Background()
-
-	// Create the networks needed for the test. One control network is needed for
-	// the gRPC control packets and one test network on which to transmit the test
-	// packets.
-	ctrlNet := dockerutil.NewNetwork(ctx, logger("ctrlNet"))
-	testNet := dockerutil.NewNetwork(ctx, logger("testNet"))
-	for _, dn := range []*dockerutil.Network{ctrlNet, testNet} {
-		for {
-			if err := createDockerNetwork(ctx, dn); err != nil {
-				t.Log("creating docker network:", err)
-				const wait = 100 * time.Millisecond
-				t.Logf("sleeping %s and will try creating docker network again", wait)
-				// This can fail if another docker network claimed the same IP so we'll
-				// just try again.
-				time.Sleep(wait)
-				continue
-			}
-			break
-		}
-		defer func(dn *dockerutil.Network) {
-			if err := dn.Cleanup(ctx); err != nil {
-				t.Errorf("unable to cleanup container %s: %s", dn.Name, err)
-			}
-		}(dn)
-		// Sanity check.
-		inspect, err := dn.Inspect(ctx)
-		if err != nil {
-			t.Fatalf("failed to inspect network %s: %v", dn.Name, err)
-		} else if inspect.Name != dn.Name {
-			t.Fatalf("name mismatch for network want: %s got: %s", dn.Name, inspect.Name)
-		}
-
-	}
-
-	tmpDir, err := ioutil.TempDir("", "container-output")
-	if err != nil {
-		t.Fatal("creating temp dir:", err)
-	}
-	defer os.RemoveAll(tmpDir)
-
-	const testOutputDir = "/tmp/testoutput"
-
-	// Create the Docker container for the DUT.
-	var dut *dockerutil.Container
-	if *native {
-		dut = dockerutil.MakeNativeContainer(ctx, logger("dut"))
-	} else {
-		dut = dockerutil.MakeContainer(ctx, logger("dut"))
-	}
-
-	runOpts := dockerutil.RunOpts{
-		Image:  "packetimpact",
-		CapAdd: []string{"NET_ADMIN"},
-		Mounts: []mount.Mount{mount.Mount{
-			Type:     mount.TypeBind,
-			Source:   tmpDir,
-			Target:   testOutputDir,
-			ReadOnly: false,
-		}},
-	}
-
-	const containerPosixServerBinary = "/packetimpact/posix_server"
-	dut.CopyFiles(&runOpts, "/packetimpact", "/test/packetimpact/dut/posix_server")
-
-	conf, hostconf, _ := dut.ConfigsFrom(runOpts, containerPosixServerBinary, "--ip=0.0.0.0", "--port="+ctrlPort)
-	hostconf.AutoRemove = true
-	hostconf.Sysctls = map[string]string{"net.ipv6.conf.all.disable_ipv6": "0"}
-
-	if err := dut.CreateFrom(ctx, conf, hostconf, nil); err != nil {
-		t.Fatalf("unable to create container %s: %v", dut.Name, err)
-	}
-
-	defer dut.CleanUp(ctx)
-
-	// Add ctrlNet as eth1 and testNet as eth2.
-	const testNetDev = "eth2"
-	if err := addNetworks(ctx, dut, dutAddr, []*dockerutil.Network{ctrlNet, testNet}); err != nil {
-		t.Fatal(err)
-	}
-
-	if err := dut.Start(ctx); err != nil {
-		t.Fatalf("unable to start container %s: %s", dut.Name, err)
-	}
-
-	if _, err := dut.WaitForOutput(ctx, "Server listening.*\n", 60*time.Second); err != nil {
-		t.Fatalf("%s on container %s never listened: %s", containerPosixServerBinary, dut.Name, err)
-	}
-
-	dutTestDevice, dutDeviceInfo, err := deviceByIP(ctx, dut, addressInSubnet(dutAddr, *testNet.Subnet))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	remoteMAC := dutDeviceInfo.MAC
-	remoteIPv6 := dutDeviceInfo.IPv6Addr
-	// Netstack as DUT doesn't assign IPv6 addresses automatically so do it if
-	// needed.
-	if remoteIPv6 == nil {
-		if _, err := dut.Exec(ctx, dockerutil.ExecOpts{}, "ip", "addr", "add", netdevs.MACToIP(remoteMAC).String(), "scope", "link", "dev", dutTestDevice); err != nil {
-			t.Fatalf("unable to ip addr add on container %s: %s", dut.Name, err)
-		}
-		// Now try again, to make sure that it worked.
-		_, dutDeviceInfo, err = deviceByIP(ctx, dut, addressInSubnet(dutAddr, *testNet.Subnet))
-		if err != nil {
-			t.Fatal(err)
-		}
-		remoteIPv6 = dutDeviceInfo.IPv6Addr
-		if remoteIPv6 == nil {
-			t.Fatal("unable to set IPv6 address on container", dut.Name)
-		}
-	}
-
-	// Create the Docker container for the testbench.
-	testbench := dockerutil.MakeNativeContainer(ctx, logger("testbench"))
-
-	tbb := path.Base(*testbenchBinary)
-	containerTestbenchBinary := "/packetimpact/" + tbb
-	runOpts = dockerutil.RunOpts{
-		Image:  "packetimpact",
-		CapAdd: []string{"NET_ADMIN"},
-		Mounts: []mount.Mount{mount.Mount{
-			Type:     mount.TypeBind,
-			Source:   tmpDir,
-			Target:   testOutputDir,
-			ReadOnly: false,
-		}},
-	}
-	testbench.CopyFiles(&runOpts, "/packetimpact", "/test/packetimpact/tests/"+tbb)
-
-	// Run tcpdump in the test bench unbuffered, without DNS resolution, just on
-	// the interface with the test packets.
-	snifferArgs := []string{
-		"tcpdump",
-		"-S", "-vvv", "-U", "-n",
-		"-i", testNetDev,
-		"-w", testOutputDir + "/dump.pcap",
-	}
-	snifferRegex := "tcpdump: listening.*\n"
-	if *tshark {
-		// Run tshark in the test bench unbuffered, without DNS resolution, just on
-		// the interface with the test packets.
-		snifferArgs = []string{
-			"tshark", "-V", "-l", "-n", "-i", testNetDev,
-			"-o", "tcp.check_checksum:TRUE",
-			"-o", "udp.check_checksum:TRUE",
-		}
-		snifferRegex = "Capturing on.*\n"
-	}
-
-	defer func() {
-		if err := exec.Command("/bin/cp", "-r", tmpDir, os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR")).Run(); err != nil {
-			t.Error("unable to copy container output files:", err)
-		}
-	}()
-
-	conf, hostconf, _ = testbench.ConfigsFrom(runOpts, snifferArgs...)
-	hostconf.AutoRemove = true
-	hostconf.Sysctls = map[string]string{"net.ipv6.conf.all.disable_ipv6": "0"}
-
-	if err := testbench.CreateFrom(ctx, conf, hostconf, nil); err != nil {
-		t.Fatalf("unable to create container %s: %s", testbench.Name, err)
-	}
-	defer testbench.CleanUp(ctx)
-
-	// Add ctrlNet as eth1 and testNet as eth2.
-	if err := addNetworks(ctx, testbench, testbenchAddr, []*dockerutil.Network{ctrlNet, testNet}); err != nil {
-		t.Fatal(err)
-	}
-
-	if err := testbench.Start(ctx); err != nil {
-		t.Fatalf("unable to start container %s: %s", testbench.Name, err)
-	}
-
-	// Kill so that it will flush output.
-	defer func() {
-		time.Sleep(1 * time.Second)
-		testbench.Exec(ctx, dockerutil.ExecOpts{}, "killall", snifferArgs[0])
-	}()
-
-	if _, err := testbench.WaitForOutput(ctx, snifferRegex, 60*time.Second); err != nil {
-		t.Fatalf("sniffer on %s never listened: %s", dut.Name, err)
-	}
-
-	// Because the Linux kernel receives the SYN-ACK but didn't send the SYN it
-	// will issue an RST. To prevent this IPtables can be used to filter out all
-	// incoming packets. The raw socket that packetimpact tests use will still see
-	// everything.
-	for _, bin := range []string{"iptables", "ip6tables"} {
-		if logs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, bin, "-A", "INPUT", "-i", testNetDev, "-p", "tcp", "-j", "DROP"); err != nil {
-			t.Fatalf("unable to Exec %s on container %s: %s, logs from testbench:\n%s", bin, testbench.Name, err, logs)
-		}
-	}
-
-	// FIXME(b/156449515): Some piece of the system has a race. The old
-	// bash script version had a sleep, so we have one too. The race should
-	// be fixed and this sleep removed.
-	time.Sleep(time.Second)
-
-	// Start a packetimpact test on the test bench. The packetimpact test sends
-	// and receives packets and also sends POSIX socket commands to the
-	// posix_server to be executed on the DUT.
-	testArgs := []string{containerTestbenchBinary}
-	testArgs = append(testArgs, extraTestArgs...)
-	testArgs = append(testArgs,
-		"--posix_server_ip", addressInSubnet(dutAddr, *ctrlNet.Subnet).String(),
-		"--posix_server_port", ctrlPort,
-		"--remote_ipv4", addressInSubnet(dutAddr, *testNet.Subnet).String(),
-		"--local_ipv4", addressInSubnet(testbenchAddr, *testNet.Subnet).String(),
-		"--remote_ipv6", remoteIPv6.String(),
-		"--remote_mac", remoteMAC.String(),
-		"--remote_interface_id", fmt.Sprintf("%d", dutDeviceInfo.ID),
-		"--device", testNetDev,
-		fmt.Sprintf("--native=%t", *native),
-	)
-	testbenchLogs, err := testbench.Exec(ctx, dockerutil.ExecOpts{}, testArgs...)
-	if (err != nil) != *expectFailure {
-		var dutLogs string
-		if logs, err := dut.Logs(ctx); err != nil {
-			dutLogs = fmt.Sprintf("failed to fetch DUT logs: %s", err)
-		} else {
-			dutLogs = logs
-		}
-
-		t.Errorf(`test error: %v, expect failure: %t
-
-====== Begin of DUT Logs ======
-
-%s
-
-====== End of DUT Logs ======
-
-====== Begin of Testbench Logs ======
-
-%s
-
-====== End of Testbench Logs ======`,
-			err, *expectFailure, dutLogs, testbenchLogs)
-	}
-}
-
-func addNetworks(ctx context.Context, d *dockerutil.Container, addr net.IP, networks []*dockerutil.Network) error {
-	for _, dn := range networks {
-		ip := addressInSubnet(addr, *dn.Subnet)
-		// Connect to the network with the specified IP address.
-		if err := dn.Connect(ctx, d, ip.String(), ""); err != nil {
-			return fmt.Errorf("unable to connect container %s to network %s: %w", d.Name, dn.Name, err)
-		}
-	}
-	return nil
-}
-
-// addressInSubnet combines the subnet provided with the address and returns a
-// new address. The return address bits come from the subnet where the mask is 1
-// and from the ip address where the mask is 0.
-func addressInSubnet(addr net.IP, subnet net.IPNet) net.IP {
-	var octets []byte
-	for i := 0; i < 4; i++ {
-		octets = append(octets, (subnet.IP.To4()[i]&subnet.Mask[i])+(addr.To4()[i]&(^subnet.Mask[i])))
-	}
-	return net.IP(octets)
-}
-
-// createDockerNetwork makes a randomly-named network that will start with the
-// namePrefix. The network will be a random /24 subnet.
-func createDockerNetwork(ctx context.Context, n *dockerutil.Network) error {
-	randSource := rand.NewSource(time.Now().UnixNano())
-	r1 := rand.New(randSource)
-	// Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
-	ip := net.IPv4(byte(r1.Intn(224-192)+192), byte(r1.Intn(256)), byte(r1.Intn(256)), 0)
-	n.Subnet = &net.IPNet{
-		IP:   ip,
-		Mask: ip.DefaultMask(),
-	}
-	return n.Create(ctx)
-}
-
-// deviceByIP finds a deviceInfo and device name from an IP address.
-func deviceByIP(ctx context.Context, d *dockerutil.Container, ip net.IP) (string, netdevs.DeviceInfo, error) {
-	out, err := d.Exec(ctx, dockerutil.ExecOpts{}, "ip", "addr", "show")
-	if err != nil {
-		return "", netdevs.DeviceInfo{}, fmt.Errorf("listing devices on %s container: %w", d.Name, err)
-	}
-	devs, err := netdevs.ParseDevices(out)
-	if err != nil {
-		return "", netdevs.DeviceInfo{}, fmt.Errorf("parsing devices from %s container: %w", d.Name, err)
-	}
-	testDevice, deviceInfo, err := netdevs.FindDeviceByIP(ip, devs)
-	if err != nil {
-		return "", netdevs.DeviceInfo{}, fmt.Errorf("can't find deviceInfo for container %s: %w", d.Name, err)
-	}
-	return testDevice, deviceInfo, nil
+	runner.TestWithDUT(context.Background(), t, runner.NewDockerDUT, runner.DutAddr)
 }
diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index 3af5f83fd..a90046f69 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -615,7 +615,7 @@ func (conn *Connection) ExpectFrame(t *testing.T, layers Layers, timeout time.Du
 			if errs == nil {
 				return nil, fmt.Errorf("got no frames matching %v during %s", layers, timeout)
 			}
-			return nil, fmt.Errorf("got no frames matching %v during %s: got %w", layers, timeout, errs)
+			return nil, fmt.Errorf("got frames %w want %v during %s", errs, layers, timeout)
 		}
 		if conn.match(layers, gotLayers) {
 			for i, s := range conn.layerStates {
diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go
index 73c532e75..6165ab293 100644
--- a/test/packetimpact/testbench/dut.go
+++ b/test/packetimpact/testbench/dut.go
@@ -16,11 +16,13 @@ package testbench
 
 import (
 	"context"
+	"encoding/binary"
 	"flag"
 	"net"
 	"strconv"
 	"syscall"
 	"testing"
+	"time"
 
 	pb "gvisor.dev/gvisor/test/packetimpact/proto/posix_server_go_proto"
 
@@ -700,3 +702,43 @@ func (dut *DUT) RecvWithErrno(ctx context.Context, t *testing.T, sockfd, len, fl
 	}
 	return resp.GetRet(), resp.GetBuf(), syscall.Errno(resp.GetErrno_())
 }
+
+// SetSockLingerOption sets SO_LINGER socket option on the DUT.
+func (dut *DUT) SetSockLingerOption(t *testing.T, sockfd int32, timeout time.Duration, enable bool) {
+	var linger unix.Linger
+	if enable {
+		linger.Onoff = 1
+	}
+	linger.Linger = int32(timeout / time.Second)
+
+	buf := make([]byte, 8)
+	binary.LittleEndian.PutUint32(buf, uint32(linger.Onoff))
+	binary.LittleEndian.PutUint32(buf[4:], uint32(linger.Linger))
+	dut.SetSockOpt(t, sockfd, unix.SOL_SOCKET, unix.SO_LINGER, buf)
+}
+
+// Shutdown calls shutdown on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is needed, use
+// ShutdownWithErrno.
+func (dut *DUT) Shutdown(t *testing.T, fd, how int32) error {
+	t.Helper()
+
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	return dut.ShutdownWithErrno(ctx, t, fd, how)
+}
+
+// ShutdownWithErrno calls shutdown on the DUT.
+func (dut *DUT) ShutdownWithErrno(ctx context.Context, t *testing.T, fd, how int32) error {
+	t.Helper()
+
+	req := pb.ShutdownRequest{
+		Fd:  fd,
+		How: how,
+	}
+	resp, err := dut.posixServer.Shutdown(ctx, &req)
+	if err != nil {
+		t.Fatalf("failed to call Shutdown: %s", err)
+	}
+	return syscall.Errno(resp.GetErrno_())
+}
diff --git a/test/packetimpact/testbench/rawsockets.go b/test/packetimpact/testbench/rawsockets.go
index 57e822725..193bb2dc8 100644
--- a/test/packetimpact/testbench/rawsockets.go
+++ b/test/packetimpact/testbench/rawsockets.go
@@ -139,7 +139,7 @@ type Injector struct {
 func NewInjector(t *testing.T) (Injector, error) {
 	t.Helper()
 
-	ifInfo, err := net.InterfaceByName(Device)
+	ifInfo, err := net.InterfaceByName(LocalDevice)
 	if err != nil {
 		return Injector{}, err
 	}
diff --git a/test/packetimpact/testbench/testbench.go b/test/packetimpact/testbench/testbench.go
index e3629e1f3..3c85ebbee 100644
--- a/test/packetimpact/testbench/testbench.go
+++ b/test/packetimpact/testbench/testbench.go
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// Package testbench is the packetimpact test API.
 package testbench
 
 import (
@@ -29,8 +30,11 @@ import (
 var (
 	// Native indicates that the test is being run natively.
 	Native = false
-	// Device is the local device on the test network.
-	Device = ""
+	// LocalDevice is the device that testbench uses to inject traffic.
+	LocalDevice = ""
+	// RemoteDevice is the device name on the DUT, individual tests can
+	// use the name to construct tests.
+	RemoteDevice = ""
 
 	// LocalIPv4 is the local IPv4 address on the test network.
 	LocalIPv4 = ""
@@ -80,7 +84,8 @@ func RegisterFlags(fs *flag.FlagSet) {
 	fs.StringVar(&RemoteIPv4, "remote_ipv4", RemoteIPv4, "remote IPv4 address for test packets")
 	fs.StringVar(&RemoteIPv6, "remote_ipv6", RemoteIPv6, "remote IPv6 address for test packets")
 	fs.StringVar(&RemoteMAC, "remote_mac", RemoteMAC, "remote mac address for test packets")
-	fs.StringVar(&Device, "device", Device, "local device for test packets")
+	fs.StringVar(&LocalDevice, "local_device", LocalDevice, "local device to inject traffic")
+	fs.StringVar(&RemoteDevice, "remote_device", RemoteDevice, "remote device on the DUT")
 	fs.BoolVar(&Native, "native", Native, "whether the test is running natively")
 	fs.Uint64Var(&RemoteInterfaceID, "remote_interface_id", RemoteInterfaceID, "remote interface ID for test packets")
 }
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 74658fea0..8c2de5a9f 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -1,11 +1,11 @@
-load("//test/packetimpact/runner:defs.bzl", "packetimpact_go_test")
+load("//test/packetimpact/runner:defs.bzl", "ALL_TESTS", "packetimpact_go_test", "packetimpact_testbench", "validate_all_tests")
 
 package(
     default_visibility = ["//test/packetimpact:__subpackages__"],
     licenses = ["notice"],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "fin_wait2_timeout",
     srcs = ["fin_wait2_timeout_test.go"],
     deps = [
@@ -15,7 +15,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "ipv4_id_uniqueness",
     srcs = ["ipv4_id_uniqueness_test.go"],
     deps = [
@@ -26,7 +26,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "udp_discard_mcast_source_addr",
     srcs = ["udp_discard_mcast_source_addr_test.go"],
     deps = [
@@ -37,7 +37,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "udp_recv_mcast_bcast",
     srcs = ["udp_recv_mcast_bcast_test.go"],
     deps = [
@@ -49,7 +49,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "udp_any_addr_recv_unicast",
     srcs = ["udp_any_addr_recv_unicast_test.go"],
     deps = [
@@ -60,7 +60,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "udp_icmp_error_propagation",
     srcs = ["udp_icmp_error_propagation_test.go"],
     deps = [
@@ -71,11 +71,9 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_reordering",
     srcs = ["tcp_reordering_test.go"],
-    # TODO(b/139368047): Fix netstack then remove the line below.
-    expect_netstack_failure = True,
     deps = [
         "//pkg/tcpip/header",
         "//pkg/tcpip/seqnum",
@@ -84,7 +82,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_window_shrink",
     srcs = ["tcp_window_shrink_test.go"],
     deps = [
@@ -94,7 +92,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_zero_window_probe",
     srcs = ["tcp_zero_window_probe_test.go"],
     deps = [
@@ -104,7 +102,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_zero_window_probe_retransmit",
     srcs = ["tcp_zero_window_probe_retransmit_test.go"],
     deps = [
@@ -114,7 +112,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_zero_window_probe_usertimeout",
     srcs = ["tcp_zero_window_probe_usertimeout_test.go"],
     deps = [
@@ -124,7 +122,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_retransmits",
     srcs = ["tcp_retransmits_test.go"],
     deps = [
@@ -134,7 +132,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_outside_the_window",
     srcs = ["tcp_outside_the_window_test.go"],
     deps = [
@@ -145,7 +143,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_noaccept_close_rst",
     srcs = ["tcp_noaccept_close_rst_test.go"],
     deps = [
@@ -155,7 +153,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_send_window_sizes_piggyback",
     srcs = ["tcp_send_window_sizes_piggyback_test.go"],
     deps = [
@@ -165,9 +163,9 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
-    name = "tcp_close_wait_ack",
-    srcs = ["tcp_close_wait_ack_test.go"],
+packetimpact_testbench(
+    name = "tcp_unacc_seq_ack",
+    srcs = ["tcp_unacc_seq_ack_test.go"],
     deps = [
         "//pkg/tcpip/header",
         "//pkg/tcpip/seqnum",
@@ -176,11 +174,9 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_paws_mechanism",
     srcs = ["tcp_paws_mechanism_test.go"],
-    # TODO(b/156682000): Fix netstack then remove the line below.
-    expect_netstack_failure = True,
     deps = [
         "//pkg/tcpip/header",
         "//pkg/tcpip/seqnum",
@@ -189,7 +185,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_user_timeout",
     srcs = ["tcp_user_timeout_test.go"],
     deps = [
@@ -199,7 +195,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_queue_receive_in_syn_sent",
     srcs = ["tcp_queue_receive_in_syn_sent_test.go"],
     deps = [
@@ -209,7 +205,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_synsent_reset",
     srcs = ["tcp_synsent_reset_test.go"],
     deps = [
@@ -219,7 +215,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_synrcvd_reset",
     srcs = ["tcp_synrcvd_reset_test.go"],
     deps = [
@@ -229,7 +225,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_network_unreachable",
     srcs = ["tcp_network_unreachable_test.go"],
     deps = [
@@ -239,7 +235,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_cork_mss",
     srcs = ["tcp_cork_mss_test.go"],
     deps = [
@@ -249,7 +245,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "tcp_handshake_window_size",
     srcs = ["tcp_handshake_window_size_test.go"],
     deps = [
@@ -259,11 +255,29 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
+    name = "tcp_timewait_reset",
+    srcs = ["tcp_timewait_reset_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_testbench(
+    name = "tcp_queue_send_in_syn_sent",
+    srcs = ["tcp_queue_send_in_syn_sent_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_testbench(
     name = "icmpv6_param_problem",
     srcs = ["icmpv6_param_problem_test.go"],
-    # TODO(b/153485026): Fix netstack then remove the line below.
-    expect_netstack_failure = True,
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/header",
@@ -272,11 +286,9 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "ipv6_unknown_options_action",
     srcs = ["ipv6_unknown_options_action_test.go"],
-    # TODO(b/159928940): Fix netstack then remove the line below.
-    expect_netstack_failure = True,
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/header",
@@ -285,11 +297,9 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "ipv6_fragment_reassembly",
     srcs = ["ipv6_fragment_reassembly_test.go"],
-    # TODO(b/160919104): Fix netstack then remove the line below.
-    expect_netstack_failure = True,
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
@@ -299,7 +309,7 @@ packetimpact_go_test(
     ],
 )
 
-packetimpact_go_test(
+packetimpact_testbench(
     name = "udp_send_recv_dgram",
     srcs = ["udp_send_recv_dgram_test.go"],
     deps = [
@@ -308,3 +318,30 @@ packetimpact_go_test(
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
+
+packetimpact_testbench(
+    name = "tcp_linger",
+    srcs = ["tcp_linger_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_testbench(
+    name = "tcp_rcv_buf_space",
+    srcs = ["tcp_rcv_buf_space_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+validate_all_tests()
+
+[packetimpact_go_test(
+    name = t.name,
+    expect_netstack_failure = hasattr(t, "expect_netstack_failure"),
+) for t in ALL_TESTS]
diff --git a/test/packetimpact/tests/ipv4_id_uniqueness_test.go b/test/packetimpact/tests/ipv4_id_uniqueness_test.go
index cf881418c..7f7a768d3 100644
--- a/test/packetimpact/tests/ipv4_id_uniqueness_test.go
+++ b/test/packetimpact/tests/ipv4_id_uniqueness_test.go
@@ -88,7 +88,8 @@ func TestIPv4RetransmitIdentificationUniqueness(t *testing.T) {
 			// this test. Once the socket option is supported, the following call
 			// can be changed to simply assert success.
 			ret, errno := dut.SetSockOptIntWithErrno(context.Background(), t, remoteFD, unix.IPPROTO_IP, linux.IP_MTU_DISCOVER, linux.IP_PMTUDISC_DONT)
-			if ret == -1 && errno != unix.ENOTSUP {
+			// Fuchsia will return ENOPROTOPT errno.
+			if ret == -1 && errno != unix.ENOPROTOOPT {
 				t.Fatalf("failed to set IP_MTU_DISCOVER socket option to IP_PMTUDISC_DONT: %s", errno)
 			}
 
diff --git a/test/packetimpact/tests/tcp_close_wait_ack_test.go b/test/packetimpact/tests/tcp_close_wait_ack_test.go
deleted file mode 100644
index e6a96f214..000000000
--- a/test/packetimpact/tests/tcp_close_wait_ack_test.go
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package tcp_close_wait_ack_test
-
-import (
-	"flag"
-	"fmt"
-	"testing"
-	"time"
-
-	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
-	"gvisor.dev/gvisor/test/packetimpact/testbench"
-)
-
-func init() {
-	testbench.RegisterFlags(flag.CommandLine)
-}
-
-func TestCloseWaitAck(t *testing.T) {
-	for _, tt := range []struct {
-		description    string
-		makeTestingTCP func(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset, windowSize seqnum.Size) testbench.TCP
-		seqNumOffset   seqnum.Size
-		expectAck      bool
-	}{
-		{"OTW", generateOTWSeqSegment, 0, false},
-		{"OTW", generateOTWSeqSegment, 1, true},
-		{"OTW", generateOTWSeqSegment, 2, true},
-		{"ACK", generateUnaccACKSegment, 0, false},
-		{"ACK", generateUnaccACKSegment, 1, true},
-		{"ACK", generateUnaccACKSegment, 2, true},
-	} {
-		t.Run(fmt.Sprintf("%s%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
-			dut := testbench.NewDUT(t)
-			defer dut.TearDown()
-			listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
-			defer dut.Close(t, listenFd)
-			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
-			defer conn.Close(t)
-
-			conn.Connect(t)
-			acceptFd, _ := dut.Accept(t, listenFd)
-
-			// Send a FIN to DUT to intiate the active close
-			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)})
-			gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
-			if err != nil {
-				t.Fatalf("expected an ACK for our fin and DUT should enter CLOSE_WAIT: %s", err)
-			}
-			windowSize := seqnum.Size(*gotTCP.WindowSize)
-
-			// Send a segment with OTW Seq / unacc ACK and expect an ACK back
-			conn.Send(t, tt.makeTestingTCP(t, &conn, tt.seqNumOffset, windowSize), &testbench.Payload{Bytes: []byte("Sample Data")})
-			gotAck, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
-			if tt.expectAck && err != nil {
-				t.Fatalf("expected an ack but got none: %s", err)
-			}
-			if !tt.expectAck && gotAck != nil {
-				t.Fatalf("expected no ack but got one: %s", gotAck)
-			}
-
-			// Now let's verify DUT is indeed in CLOSE_WAIT
-			dut.Close(t, acceptFd)
-			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)}, time.Second); err != nil {
-				t.Fatalf("expected DUT to send a FIN: %s", err)
-			}
-			// Ack the FIN from DUT
-			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
-			// Send some extra data to DUT
-			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: []byte("Sample Data")})
-			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
-				t.Fatalf("expected DUT to send an RST: %s", err)
-			}
-		})
-	}
-}
-
-// generateOTWSeqSegment generates an segment with
-// seqnum = RCV.NXT + RCV.WND + seqNumOffset, the generated segment is only
-// acceptable when seqNumOffset is 0, otherwise an ACK is expected from the
-// receiver.
-func generateOTWSeqSegment(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP {
-	lastAcceptable := conn.LocalSeqNum(t).Add(windowSize)
-	otwSeq := uint32(lastAcceptable.Add(seqNumOffset))
-	return testbench.TCP{SeqNum: testbench.Uint32(otwSeq), Flags: testbench.Uint8(header.TCPFlagAck)}
-}
-
-// generateUnaccACKSegment generates an segment with
-// acknum = SND.NXT + seqNumOffset, the generated segment is only acceptable
-// when seqNumOffset is 0, otherwise an ACK is expected from the receiver.
-func generateUnaccACKSegment(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP {
-	lastAcceptable := conn.RemoteSeqNum(t)
-	unaccAck := uint32(lastAcceptable.Add(seqNumOffset))
-	return testbench.TCP{AckNum: testbench.Uint32(unaccAck), Flags: testbench.Uint8(header.TCPFlagAck)}
-}
diff --git a/test/packetimpact/tests/tcp_linger_test.go b/test/packetimpact/tests/tcp_linger_test.go
new file mode 100644
index 000000000..b9a0409aa
--- /dev/null
+++ b/test/packetimpact/tests/tcp_linger_test.go
@@ -0,0 +1,270 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_linger_test
+
+import (
+	"context"
+	"flag"
+	"syscall"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func createSocket(t *testing.T, dut testbench.DUT) (int32, int32, testbench.TCPIPv4) {
+	listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	conn.Connect(t)
+	acceptFD, _ := dut.Accept(t, listenFD)
+	return acceptFD, listenFD, conn
+}
+
+func closeAll(t *testing.T, dut testbench.DUT, listenFD int32, conn testbench.TCPIPv4) {
+	conn.Close(t)
+	dut.Close(t, listenFD)
+	dut.TearDown()
+}
+
+// lingerDuration is the timeout value used with SO_LINGER socket option.
+const lingerDuration = 3 * time.Second
+
+// TestTCPLingerZeroTimeout tests when SO_LINGER is set with zero timeout. DUT
+// should send RST-ACK when socket is closed.
+func TestTCPLingerZeroTimeout(t *testing.T) {
+	// Create a socket, listen, TCP connect, and accept.
+	dut := testbench.NewDUT(t)
+	acceptFD, listenFD, conn := createSocket(t, dut)
+	defer closeAll(t, dut, listenFD, conn)
+
+	dut.SetSockLingerOption(t, acceptFD, 0, true)
+	dut.Close(t, acceptFD)
+
+	// If the linger timeout is set to zero, the DUT should send a RST.
+	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}, time.Second); err != nil {
+		t.Errorf("expected RST-ACK packet within a second but got none: %s", err)
+	}
+	conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+}
+
+// TestTCPLingerOff tests when SO_LINGER is not set. DUT should send FIN-ACK
+// when socket is closed.
+func TestTCPLingerOff(t *testing.T) {
+	// Create a socket, listen, TCP connect, and accept.
+	dut := testbench.NewDUT(t)
+	acceptFD, listenFD, conn := createSocket(t, dut)
+	defer closeAll(t, dut, listenFD, conn)
+
+	dut.Close(t, acceptFD)
+
+	// If SO_LINGER is not set, DUT should send a FIN-ACK.
+	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
+		t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
+	}
+	conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+}
+
+// TestTCPLingerNonZeroTimeout tests when SO_LINGER is set with non-zero timeout.
+// DUT should close the socket after timeout.
+func TestTCPLingerNonZeroTimeout(t *testing.T) {
+	for _, tt := range []struct {
+		description string
+		lingerOn    bool
+	}{
+		{"WithNonZeroLinger", true},
+		{"WithoutLinger", false},
+	} {
+		t.Run(tt.description, func(t *testing.T) {
+			// Create a socket, listen, TCP connect, and accept.
+			dut := testbench.NewDUT(t)
+			acceptFD, listenFD, conn := createSocket(t, dut)
+			defer closeAll(t, dut, listenFD, conn)
+
+			dut.SetSockLingerOption(t, acceptFD, lingerDuration, tt.lingerOn)
+
+			// Increase timeout as Close will take longer time to
+			// return when SO_LINGER is set with non-zero timeout.
+			timeout := lingerDuration + 1*time.Second
+			ctx, cancel := context.WithTimeout(context.Background(), timeout)
+			defer cancel()
+			start := time.Now()
+			dut.CloseWithErrno(ctx, t, acceptFD)
+			end := time.Now()
+			diff := end.Sub(start)
+
+			if tt.lingerOn && diff < lingerDuration {
+				t.Errorf("expected close to return after %v seconds, but returned sooner", lingerDuration)
+			} else if !tt.lingerOn && diff > 1*time.Second {
+				t.Errorf("expected close to return within a second, but returned later")
+			}
+
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
+				t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
+			}
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+		})
+	}
+}
+
+// TestTCPLingerSendNonZeroTimeout tests when SO_LINGER is set with non-zero
+// timeout and send a packet. DUT should close the socket after timeout.
+func TestTCPLingerSendNonZeroTimeout(t *testing.T) {
+	for _, tt := range []struct {
+		description string
+		lingerOn    bool
+	}{
+		{"WithSendNonZeroLinger", true},
+		{"WithoutLinger", false},
+	} {
+		t.Run(tt.description, func(t *testing.T) {
+			// Create a socket, listen, TCP connect, and accept.
+			dut := testbench.NewDUT(t)
+			acceptFD, listenFD, conn := createSocket(t, dut)
+			defer closeAll(t, dut, listenFD, conn)
+
+			dut.SetSockLingerOption(t, acceptFD, lingerDuration, tt.lingerOn)
+
+			// Send data.
+			sampleData := []byte("Sample Data")
+			dut.Send(t, acceptFD, sampleData, 0)
+
+			// Increase timeout as Close will take longer time to
+			// return when SO_LINGER is set with non-zero timeout.
+			timeout := lingerDuration + 1*time.Second
+			ctx, cancel := context.WithTimeout(context.Background(), timeout)
+			defer cancel()
+			start := time.Now()
+			dut.CloseWithErrno(ctx, t, acceptFD)
+			end := time.Now()
+			diff := end.Sub(start)
+
+			if tt.lingerOn && diff < lingerDuration {
+				t.Errorf("expected close to return after %v seconds, but returned sooner", lingerDuration)
+			} else if !tt.lingerOn && diff > 1*time.Second {
+				t.Errorf("expected close to return within a second, but returned later")
+			}
+
+			samplePayload := &testbench.Payload{Bytes: sampleData}
+			if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil {
+				t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+			}
+
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
+				t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
+			}
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+		})
+	}
+}
+
+// TestTCPLingerShutdownZeroTimeout tests SO_LINGER with shutdown() and zero
+// timeout. DUT should send RST-ACK when socket is closed.
+func TestTCPLingerShutdownZeroTimeout(t *testing.T) {
+	// Create a socket, listen, TCP connect, and accept.
+	dut := testbench.NewDUT(t)
+	acceptFD, listenFD, conn := createSocket(t, dut)
+	defer closeAll(t, dut, listenFD, conn)
+
+	dut.SetSockLingerOption(t, acceptFD, 0, true)
+	dut.Shutdown(t, acceptFD, syscall.SHUT_RDWR)
+	dut.Close(t, acceptFD)
+
+	// Shutdown will send FIN-ACK with read/write option.
+	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
+		t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
+	}
+
+	// If the linger timeout is set to zero, the DUT should send a RST.
+	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}, time.Second); err != nil {
+		t.Errorf("expected RST-ACK packet within a second but got none: %s", err)
+	}
+	conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+}
+
+// TestTCPLingerShutdownSendNonZeroTimeout tests SO_LINGER with shutdown() and
+// non-zero timeout. DUT should close the socket after timeout.
+func TestTCPLingerShutdownSendNonZeroTimeout(t *testing.T) {
+	for _, tt := range []struct {
+		description string
+		lingerOn    bool
+	}{
+		{"shutdownRDWR", true},
+		{"shutdownRDWR", false},
+	} {
+		t.Run(tt.description, func(t *testing.T) {
+			// Create a socket, listen, TCP connect, and accept.
+			dut := testbench.NewDUT(t)
+			acceptFD, listenFD, conn := createSocket(t, dut)
+			defer closeAll(t, dut, listenFD, conn)
+
+			dut.SetSockLingerOption(t, acceptFD, lingerDuration, tt.lingerOn)
+
+			// Send data.
+			sampleData := []byte("Sample Data")
+			dut.Send(t, acceptFD, sampleData, 0)
+
+			dut.Shutdown(t, acceptFD, syscall.SHUT_RDWR)
+
+			// Increase timeout as Close will take longer time to
+			// return when SO_LINGER is set with non-zero timeout.
+			timeout := lingerDuration + 1*time.Second
+			ctx, cancel := context.WithTimeout(context.Background(), timeout)
+			defer cancel()
+			start := time.Now()
+			dut.CloseWithErrno(ctx, t, acceptFD)
+			end := time.Now()
+			diff := end.Sub(start)
+
+			if tt.lingerOn && diff < lingerDuration {
+				t.Errorf("expected close to return after %v seconds, but returned sooner", lingerDuration)
+			} else if !tt.lingerOn && diff > 1*time.Second {
+				t.Errorf("expected close to return within a second, but returned later")
+			}
+
+			samplePayload := &testbench.Payload{Bytes: sampleData}
+			if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil {
+				t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+			}
+
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
+				t.Errorf("expected FIN-ACK packet within a second but got none: %s", err)
+			}
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+		})
+	}
+}
+
+func TestTCPLingerNonEstablished(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	newFD := dut.Socket(t, unix.AF_INET, unix.SOCK_STREAM, unix.IPPROTO_TCP)
+	dut.SetSockLingerOption(t, newFD, lingerDuration, true)
+
+	// As the socket is in the initial state, Close() should not linger
+	// and return immediately.
+	start := time.Now()
+	dut.CloseWithErrno(context.Background(), t, newFD)
+	diff := time.Since(start)
+
+	if diff > lingerDuration {
+		t.Errorf("expected close to return within %s, but returned after %s", lingerDuration, diff)
+	}
+	dut.TearDown()
+}
diff --git a/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go b/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go
new file mode 100644
index 000000000..0ec8fd748
--- /dev/null
+++ b/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go
@@ -0,0 +1,133 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_queue_send_in_syn_sent_test
+
+import (
+	"context"
+	"errors"
+	"flag"
+	"net"
+	"sync"
+	"syscall"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestQueueSendInSynSent tests send behavior when the TCP state
+// is SYN-SENT.
+// It tests for 2 variants when in SYN_SENT state and:
+// (1) DUT blocks on send and complete handshake
+// (2) DUT blocks on send and receive a TCP RST.
+func TestQueueSendInSynSent(t *testing.T) {
+	for _, tt := range []struct {
+		description string
+		reset       bool
+	}{
+		{description: "Complete handshake", reset: false},
+		{description: "Send RST", reset: true},
+	} {
+		t.Run(tt.description, func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			defer dut.TearDown()
+
+			socket, remotePort := dut.CreateBoundSocket(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv4))
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close(t)
+
+			sampleData := []byte("Sample Data")
+			samplePayload := &testbench.Payload{Bytes: sampleData}
+			dut.SetNonBlocking(t, socket, true)
+			if _, err := dut.ConnectWithErrno(context.Background(), t, socket, conn.LocalAddr(t)); !errors.Is(err, syscall.EINPROGRESS) {
+				t.Fatalf("failed to bring DUT to SYN-SENT, got: %s, want EINPROGRESS", err)
+			}
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}, time.Second); err != nil {
+				t.Fatalf("expected a SYN from DUT, but got none: %s", err)
+			}
+			if _, err := dut.SendWithErrno(context.Background(), t, socket, sampleData, 0); err != syscall.Errno(unix.EWOULDBLOCK) {
+				t.Fatalf("expected error %s, got %s", syscall.Errno(unix.EWOULDBLOCK), err)
+			}
+
+			// Test blocking write.
+			dut.SetNonBlocking(t, socket, false)
+
+			var wg sync.WaitGroup
+			defer wg.Wait()
+			wg.Add(1)
+			var block sync.WaitGroup
+			block.Add(1)
+			go func() {
+				defer wg.Done()
+				ctx, cancel := context.WithTimeout(context.Background(), time.Second*3)
+				defer cancel()
+
+				block.Done()
+				// Issue SEND call in SYN-SENT, this should be queued for
+				// process until the connection is established.
+				n, err := dut.SendWithErrno(ctx, t, socket, sampleData, 0)
+				if tt.reset {
+					if err != syscall.Errno(unix.ECONNREFUSED) {
+						t.Errorf("expected error %s, got %s", syscall.Errno(unix.ECONNREFUSED), err)
+					}
+					if n != -1 {
+						t.Errorf("expected return value %d, got %d", -1, n)
+					}
+					return
+				}
+				if n != int32(len(sampleData)) {
+					t.Errorf("failed to send on DUT: %s", err)
+				}
+			}()
+
+			// Wait for the goroutine to be scheduled and before it
+			// blocks on endpoint send.
+			block.Wait()
+			// The following sleep is used to prevent the connection
+			// from being established before we are blocked on send.
+			time.Sleep(100 * time.Millisecond)
+
+			if tt.reset {
+				conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)})
+				return
+			}
+
+			// Bring the connection to Established.
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)})
+
+			// Expect the data from the DUT's enqueued send request.
+			//
+			// On Linux, this can be piggybacked with the ACK completing the
+			// handshake. On gVisor, getting such a piggyback is a bit more
+			// complicated because the actual data enqueuing occurs in the
+			// callers of endpoint Write.
+			if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagPsh | header.TCPFlagAck)}, samplePayload, time.Second); err != nil {
+				t.Fatalf("expected payload was not received: %s", err)
+			}
+
+			// Send sample payload and expect an ACK to ensure connection is still ESTABLISHED.
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagPsh | header.TCPFlagAck)}, &testbench.Payload{Bytes: sampleData})
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil {
+				t.Fatalf("expected an ACK from DUT, but got none: %s", err)
+			}
+		})
+	}
+}
diff --git a/test/packetimpact/tests/tcp_rcv_buf_space_test.go b/test/packetimpact/tests/tcp_rcv_buf_space_test.go
new file mode 100644
index 000000000..cfbba1e8e
--- /dev/null
+++ b/test/packetimpact/tests/tcp_rcv_buf_space_test.go
@@ -0,0 +1,80 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_rcv_buf_space_test
+
+import (
+	"context"
+	"flag"
+	"syscall"
+	"testing"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestReduceRecvBuf tests that a packet within window is still dropped
+// if the available buffer space drops below the size of the incoming
+// segment.
+func TestReduceRecvBuf(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(t, listenFd)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close(t)
+
+	conn.Connect(t)
+	acceptFd, _ := dut.Accept(t, listenFd)
+	defer dut.Close(t, acceptFd)
+
+	// Set a small receive buffer for the test.
+	const rcvBufSz = 4096
+	dut.SetSockOptInt(t, acceptFd, unix.SOL_SOCKET, unix.SO_RCVBUF, rcvBufSz)
+
+	// Retrieve the actual buffer.
+	bufSz := dut.GetSockOptInt(t, acceptFd, unix.SOL_SOCKET, unix.SO_RCVBUF)
+
+	// Generate a payload of 1 more than the actual buffer size used by the
+	// DUT.
+	sampleData := testbench.GenerateRandomPayload(t, int(bufSz)+1)
+	// Send and receive sample data to the dut.
+	const pktSize = 1400
+	for payload := sampleData; len(payload) != 0; {
+		payloadBytes := pktSize
+		if l := len(payload); l < payloadBytes {
+			payloadBytes = l
+		}
+
+		conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, []testbench.Layer{&testbench.Payload{Bytes: payload[:payloadBytes]}}...)
+		payload = payload[payloadBytes:]
+	}
+
+	// First read should read < len(sampleData)
+	if ret, _, err := dut.RecvWithErrno(context.Background(), t, acceptFd, int32(len(sampleData)), 0); ret == -1 || int(ret) == len(sampleData) {
+		t.Fatalf("dut.RecvWithErrno(ctx, t, %d, %d, 0) = %d,_, %s", acceptFd, int32(len(sampleData)), ret, err)
+	}
+
+	// Second read should return EAGAIN as the last segment should have been
+	// dropped due to it exceeding the receive buffer space available in the
+	// socket.
+	if ret, got, err := dut.RecvWithErrno(context.Background(), t, acceptFd, int32(len(sampleData)), syscall.MSG_DONTWAIT); got != nil || ret != -1 || err != syscall.EAGAIN {
+		t.Fatalf("expected no packets but got: %s", got)
+	}
+}
diff --git a/test/packetimpact/tests/tcp_timewait_reset_test.go b/test/packetimpact/tests/tcp_timewait_reset_test.go
new file mode 100644
index 000000000..2f76a6531
--- /dev/null
+++ b/test/packetimpact/tests/tcp_timewait_reset_test.go
@@ -0,0 +1,68 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_timewait_reset_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestTimeWaitReset tests handling of RST when in TIME_WAIT state.
+func TestTimeWaitReset(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/)
+	defer dut.Close(t, listenFD)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close(t)
+
+	conn.Connect(t)
+	acceptFD, _ := dut.Accept(t, listenFD)
+
+	// Trigger active close.
+	dut.Close(t, acceptFD)
+
+	_, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second)
+	if err != nil {
+		t.Fatalf("expected a FIN: %s", err)
+	}
+	conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+	// Send a FIN, DUT should transition to TIME_WAIT from FIN_WAIT2.
+	conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)})
+	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil {
+		t.Fatalf("expected an ACK for our FIN: %s", err)
+	}
+
+	// Send a RST, the DUT should transition to CLOSED from TIME_WAIT.
+	// This is the default Linux behavior, it can be changed to ignore RSTs via
+	// sysctl net.ipv4.tcp_rfc1337.
+	conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)})
+
+	conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+	// The DUT should reply with RST to our ACK as the state should have
+	// transitioned to CLOSED.
+	if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
+		t.Fatalf("expected a RST: %s", err)
+	}
+}
diff --git a/test/packetimpact/tests/tcp_unacc_seq_ack_test.go b/test/packetimpact/tests/tcp_unacc_seq_ack_test.go
new file mode 100644
index 000000000..d078bbf15
--- /dev/null
+++ b/test/packetimpact/tests/tcp_unacc_seq_ack_test.go
@@ -0,0 +1,234 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_unacc_seq_ack_test
+
+import (
+	"flag"
+	"fmt"
+	"syscall"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func TestEstablishedUnaccSeqAck(t *testing.T) {
+	for _, tt := range []struct {
+		description    string
+		makeTestingTCP func(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset, windowSize seqnum.Size) testbench.TCP
+		seqNumOffset   seqnum.Size
+		expectAck      bool
+		restoreSeq     bool
+	}{
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 0, expectAck: true, restoreSeq: true},
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 1, expectAck: true, restoreSeq: true},
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 2, expectAck: true, restoreSeq: true},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 0, expectAck: true, restoreSeq: false},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 1, expectAck: false, restoreSeq: true},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 2, expectAck: false, restoreSeq: true},
+	} {
+		t.Run(fmt.Sprintf("%s:offset=%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			defer dut.TearDown()
+			listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/)
+			defer dut.Close(t, listenFD)
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close(t)
+
+			conn.Connect(t)
+			dut.Accept(t, listenFD)
+
+			sampleData := []byte("Sample Data")
+			samplePayload := &testbench.Payload{Bytes: sampleData}
+
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload)
+			gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+			if err != nil {
+				t.Fatalf("expected ack %s", err)
+			}
+			windowSize := seqnum.Size(*gotTCP.WindowSize)
+
+			origSeq := *conn.LocalSeqNum(t)
+			// Send a segment with OTW Seq / unacc ACK.
+			conn.Send(t, tt.makeTestingTCP(t, &conn, tt.seqNumOffset, windowSize), samplePayload)
+			if tt.restoreSeq {
+				// Restore the local sequence number to ensure that the incoming
+				// ACK matches the TCP layer state.
+				*conn.LocalSeqNum(t) = origSeq
+			}
+			gotAck, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+			if tt.expectAck && err != nil {
+				t.Fatalf("expected an ack but got none: %s", err)
+			}
+			if err == nil && !tt.expectAck && gotAck != nil {
+				t.Fatalf("expected no ack but got one: %s", gotAck)
+			}
+		})
+	}
+}
+
+func TestPassiveCloseUnaccSeqAck(t *testing.T) {
+	for _, tt := range []struct {
+		description    string
+		makeTestingTCP func(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset, windowSize seqnum.Size) testbench.TCP
+		seqNumOffset   seqnum.Size
+		expectAck      bool
+	}{
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 0, expectAck: false},
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 1, expectAck: true},
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 2, expectAck: true},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 0, expectAck: false},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 1, expectAck: true},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 2, expectAck: true},
+	} {
+		t.Run(fmt.Sprintf("%s:offset=%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			defer dut.TearDown()
+			listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/)
+			defer dut.Close(t, listenFD)
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close(t)
+
+			conn.Connect(t)
+			acceptFD, _ := dut.Accept(t, listenFD)
+
+			// Send a FIN to DUT to intiate the passive close.
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)})
+			gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+			if err != nil {
+				t.Fatalf("expected an ACK for our fin and DUT should enter CLOSE_WAIT: %s", err)
+			}
+			windowSize := seqnum.Size(*gotTCP.WindowSize)
+
+			sampleData := []byte("Sample Data")
+			samplePayload := &testbench.Payload{Bytes: sampleData}
+
+			// Send a segment with OTW Seq / unacc ACK.
+			conn.Send(t, tt.makeTestingTCP(t, &conn, tt.seqNumOffset, windowSize), samplePayload)
+			gotAck, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+			if tt.expectAck && err != nil {
+				t.Errorf("expected an ack but got none: %s", err)
+			}
+			if err == nil && !tt.expectAck && gotAck != nil {
+				t.Errorf("expected no ack but got one: %s", gotAck)
+			}
+
+			// Now let's verify DUT is indeed in CLOSE_WAIT
+			dut.Close(t, acceptFD)
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)}, time.Second); err != nil {
+				t.Fatalf("expected DUT to send a FIN: %s", err)
+			}
+			// Ack the FIN from DUT
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+			// Send some extra data to DUT
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, samplePayload)
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
+				t.Fatalf("expected DUT to send an RST: %s", err)
+			}
+		})
+	}
+}
+
+func TestActiveCloseUnaccpSeqAck(t *testing.T) {
+	for _, tt := range []struct {
+		description    string
+		makeTestingTCP func(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset, windowSize seqnum.Size) testbench.TCP
+		seqNumOffset   seqnum.Size
+		restoreSeq     bool
+	}{
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 0, restoreSeq: true},
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 1, restoreSeq: true},
+		{description: "OTWSeq", makeTestingTCP: generateOTWSeqSegment, seqNumOffset: 2, restoreSeq: true},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 0, restoreSeq: false},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 1, restoreSeq: true},
+		{description: "UnaccAck", makeTestingTCP: generateUnaccACKSegment, seqNumOffset: 2, restoreSeq: true},
+	} {
+		t.Run(fmt.Sprintf("%s:offset=%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			defer dut.TearDown()
+			listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/)
+			defer dut.Close(t, listenFD)
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close(t)
+
+			conn.Connect(t)
+			acceptFD, _ := dut.Accept(t, listenFD)
+
+			// Trigger active close.
+			dut.Shutdown(t, acceptFD, syscall.SHUT_WR)
+
+			// Get to FIN_WAIT2
+			gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second)
+			if err != nil {
+				t.Fatalf("expected a FIN: %s", err)
+			}
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+
+			sendUnaccSeqAck := func(state string) {
+				t.Helper()
+				sampleData := []byte("Sample Data")
+				samplePayload := &testbench.Payload{Bytes: sampleData}
+
+				origSeq := *conn.LocalSeqNum(t)
+				// Send a segment with OTW Seq / unacc ACK.
+				conn.Send(t, tt.makeTestingTCP(t, &conn, tt.seqNumOffset, seqnum.Size(*gotTCP.WindowSize)), samplePayload)
+				if tt.restoreSeq {
+					// Restore the local sequence number to ensure that the
+					// incoming ACK matches the TCP layer state.
+					*conn.LocalSeqNum(t) = origSeq
+				}
+				if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil {
+					t.Errorf("expected an ack in %s state, but got none: %s", state, err)
+				}
+			}
+
+			sendUnaccSeqAck("FIN_WAIT2")
+
+			// Send a FIN to DUT to get to TIME_WAIT
+			conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)})
+			if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil {
+				t.Fatalf("expected an ACK for our fin and DUT should enter TIME_WAIT: %s", err)
+			}
+
+			sendUnaccSeqAck("TIME_WAIT")
+		})
+	}
+}
+
+// generateOTWSeqSegment generates an segment with
+// seqnum = RCV.NXT + RCV.WND + seqNumOffset, the generated segment is only
+// acceptable when seqNumOffset is 0, otherwise an ACK is expected from the
+// receiver.
+func generateOTWSeqSegment(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP {
+	lastAcceptable := conn.LocalSeqNum(t).Add(windowSize)
+	otwSeq := uint32(lastAcceptable.Add(seqNumOffset))
+	return testbench.TCP{SeqNum: testbench.Uint32(otwSeq), Flags: testbench.Uint8(header.TCPFlagAck)}
+}
+
+// generateUnaccACKSegment generates an segment with
+// acknum = SND.NXT + seqNumOffset, the generated segment is only acceptable
+// when seqNumOffset is 0, otherwise an ACK is expected from the receiver.
+func generateUnaccACKSegment(t *testing.T, conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP {
+	lastAcceptable := conn.RemoteSeqNum(t)
+	unaccAck := uint32(lastAcceptable.Add(seqNumOffset))
+	return testbench.TCP{AckNum: testbench.Uint32(unaccAck), Flags: testbench.Uint8(header.TCPFlagAck)}
+}
diff --git a/test/packetimpact/tests/udp_discard_mcast_source_addr_test.go b/test/packetimpact/tests/udp_discard_mcast_source_addr_test.go
index d30177e64..3d2791a6e 100644
--- a/test/packetimpact/tests/udp_discard_mcast_source_addr_test.go
+++ b/test/packetimpact/tests/udp_discard_mcast_source_addr_test.go
@@ -53,6 +53,7 @@ func TestDiscardsUDPPacketsWithMcastSourceAddressV4(t *testing.T) {
 				t,
 				testbench.IPv4{SrcAddr: testbench.Address(tcpip.Address(mcastAddr.To4()))},
 				testbench.UDP{},
+				&testbench.Payload{Bytes: []byte("test payload")},
 			)
 
 			ret, payload, errno := dut.RecvWithErrno(context.Background(), t, remoteFD, 100, 0)
@@ -76,14 +77,15 @@ func TestDiscardsUDPPacketsWithMcastSourceAddressV6(t *testing.T) {
 		net.IPv6interfacelocalallnodes,
 		net.IPv6linklocalallnodes,
 		net.IPv6linklocalallrouters,
-		net.ParseIP("fe01::42"),
-		net.ParseIP("fe02::4242"),
+		net.ParseIP("ff01::42"),
+		net.ParseIP("ff02::4242"),
 	} {
 		t.Run(fmt.Sprintf("srcaddr=%s", mcastAddr), func(t *testing.T) {
 			conn.SendIPv6(
 				t,
 				testbench.IPv6{SrcAddr: testbench.Address(tcpip.Address(mcastAddr.To16()))},
 				testbench.UDP{},
+				&testbench.Payload{Bytes: []byte("test payload")},
 			)
 			ret, payload, errno := dut.RecvWithErrno(context.Background(), t, remoteFD, 100, 0)
 			if errno != syscall.EAGAIN || errno != syscall.EWOULDBLOCK {
diff --git a/test/perf/BUILD b/test/perf/BUILD
index 471d8c2ab..b763be50e 100644
--- a/test/perf/BUILD
+++ b/test/perf/BUILD
@@ -3,33 +3,40 @@ load("//test/runner:defs.bzl", "syscall_test")
 package(licenses = ["notice"])
 
 syscall_test(
+    debug = False,
     test = "//test/perf/linux:clock_getres_benchmark",
 )
 
 syscall_test(
+    debug = False,
     test = "//test/perf/linux:clock_gettime_benchmark",
 )
 
 syscall_test(
+    debug = False,
     test = "//test/perf/linux:death_benchmark",
 )
 
 syscall_test(
+    debug = False,
     test = "//test/perf/linux:epoll_benchmark",
 )
 
 syscall_test(
     size = "large",
+    debug = False,
     test = "//test/perf/linux:fork_benchmark",
 )
 
 syscall_test(
     size = "large",
+    debug = False,
     test = "//test/perf/linux:futex_benchmark",
 )
 
 syscall_test(
     size = "enormous",
+    debug = False,
     shard_count = 10,
     tags = ["nogotsan"],
     test = "//test/perf/linux:getdents_benchmark",
@@ -37,81 +44,96 @@ syscall_test(
 
 syscall_test(
     size = "large",
+    debug = False,
     test = "//test/perf/linux:getpid_benchmark",
 )
 
 syscall_test(
     size = "enormous",
+    debug = False,
     tags = ["nogotsan"],
     test = "//test/perf/linux:gettid_benchmark",
 )
 
 syscall_test(
     size = "large",
+    debug = False,
     test = "//test/perf/linux:mapping_benchmark",
 )
 
 syscall_test(
     size = "large",
     add_overlay = True,
+    debug = False,
     test = "//test/perf/linux:open_benchmark",
 )
 
 syscall_test(
+    debug = False,
     test = "//test/perf/linux:pipe_benchmark",
 )
 
 syscall_test(
     size = "large",
     add_overlay = True,
+    debug = False,
     test = "//test/perf/linux:randread_benchmark",
 )
 
 syscall_test(
     size = "large",
     add_overlay = True,
+    debug = False,
     test = "//test/perf/linux:read_benchmark",
 )
 
 syscall_test(
     size = "large",
+    debug = False,
     test = "//test/perf/linux:sched_yield_benchmark",
 )
 
 syscall_test(
     size = "large",
+    debug = False,
     test = "//test/perf/linux:send_recv_benchmark",
 )
 
 syscall_test(
     size = "large",
     add_overlay = True,
+    debug = False,
     test = "//test/perf/linux:seqwrite_benchmark",
 )
 
 syscall_test(
     size = "enormous",
+    debug = False,
     test = "//test/perf/linux:signal_benchmark",
 )
 
 syscall_test(
+    debug = False,
     test = "//test/perf/linux:sleep_benchmark",
 )
 
 syscall_test(
     size = "large",
     add_overlay = True,
+    debug = False,
     test = "//test/perf/linux:stat_benchmark",
 )
 
 syscall_test(
     size = "enormous",
     add_overlay = True,
+    debug = False,
     test = "//test/perf/linux:unlink_benchmark",
 )
 
 syscall_test(
     size = "large",
     add_overlay = True,
+    debug = False,
     test = "//test/perf/linux:write_benchmark",
 )
diff --git a/test/perf/linux/BUILD b/test/perf/linux/BUILD
index b4e907826..dd1d2438c 100644
--- a/test/perf/linux/BUILD
+++ b/test/perf/linux/BUILD
@@ -354,3 +354,19 @@ cc_binary(
         "//test/util:test_util",
     ],
 )
+
+cc_binary(
+    name = "open_read_close_benchmark",
+    testonly = 1,
+    srcs = [
+        "open_read_close_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+    ],
+)
diff --git a/test/perf/linux/getdents_benchmark.cc b/test/perf/linux/getdents_benchmark.cc
index d8e81fa8c..9030eb356 100644
--- a/test/perf/linux/getdents_benchmark.cc
+++ b/test/perf/linux/getdents_benchmark.cc
@@ -105,7 +105,7 @@ void BM_GetdentsSameFD(benchmark::State& state) {
   state.SetItemsProcessed(state.iterations());
 }
 
-BENCHMARK(BM_GetdentsSameFD)->Range(1, 1 << 16)->UseRealTime();
+BENCHMARK(BM_GetdentsSameFD)->Range(1, 1 << 12)->UseRealTime();
 
 // Creates a directory containing `files` files, and reads all the directory
 // entries from the directory using a new FD each time.
diff --git a/test/perf/linux/open_read_close_benchmark.cc b/test/perf/linux/open_read_close_benchmark.cc
new file mode 100644
index 000000000..8b023a3d8
--- /dev/null
+++ b/test/perf/linux/open_read_close_benchmark.cc
@@ -0,0 +1,61 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_OpenReadClose(benchmark::State& state) {
+  const int size = state.range(0);
+  std::vector<TempPath> cache;
+  for (int i = 0; i < size; i++) {
+    auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+        GetAbsoluteTestTmpdir(), "some content", 0644));
+    cache.emplace_back(std::move(path));
+  }
+
+  char buf[1];
+  unsigned int seed = 1;
+  for (auto _ : state) {
+    const int chosen = rand_r(&seed) % size;
+    int fd = open(cache[chosen].path().c_str(), O_RDONLY);
+    TEST_CHECK(fd != -1);
+    TEST_CHECK(read(fd, buf, 1) == 1);
+    close(fd);
+  }
+}
+
+// Gofer dentry cache is 1000 by default. Go over it to force files to be closed
+// for real.
+BENCHMARK(BM_OpenReadClose)->Range(1000, 16384)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/root/crictl_test.go b/test/root/crictl_test.go
index df91fa0fe..11ac5cb52 100644
--- a/test/root/crictl_test.go
+++ b/test/root/crictl_test.go
@@ -418,7 +418,7 @@ func setup(t *testing.T, version string) (*criutil.Crictl, func(), error) {
 		// care about the docker runtime name.
 		config = v2Template
 	default:
-		t.Fatalf("unknown version: %d", version)
+		t.Fatalf("unknown version: %s", version)
 	}
 	t.Logf("Using config: %s", config)
 
diff --git a/test/root/root.go b/test/root/root.go
index 0f1d29faf..441fa5e2e 100644
--- a/test/root/root.go
+++ b/test/root/root.go
@@ -17,5 +17,5 @@
 // docker, containerd, and crictl installed. To run these tests from the
 // project root directory:
 //
-//     ./scripts/root_tests.sh
+//     make root-tests
 package root
diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl
index 2d64934b0..9b5994d59 100644
--- a/test/runner/defs.bzl
+++ b/test/runner/defs.bzl
@@ -57,6 +57,7 @@ def _syscall_test(
         platform,
         use_tmpfs,
         tags,
+        debug,
         network = "none",
         file_access = "exclusive",
         overlay = False,
@@ -101,6 +102,10 @@ def _syscall_test(
     # Disable off-host networking.
     tags.append("requires-net:loopback")
 
+    # gotsan makes sense only if tests are running in gVisor.
+    if platform == "native":
+        tags.append("nogotsan")
+
     runner_args = [
         # Arguments are passed directly to runner binary.
         "--platform=" + platform,
@@ -111,6 +116,8 @@ def _syscall_test(
         "--add-uds-tree=" + str(add_uds_tree),
         "--vfs2=" + str(vfs2),
         "--fuse=" + str(fuse),
+        "--strace=" + str(debug),
+        "--debug=" + str(debug),
     ]
 
     # Call the rule above.
@@ -134,6 +141,7 @@ def syscall_test(
         add_hostinet = False,
         vfs2 = True,
         fuse = False,
+        debug = True,
         tags = None):
     """syscall_test is a macro that will create targets for all platforms.
 
@@ -171,6 +179,7 @@ def syscall_test(
         use_tmpfs = use_tmpfs,
         add_uds_tree = add_uds_tree,
         tags = platforms[default_platform] + vfs2_tags,
+        debug = debug,
         vfs2 = True,
         fuse = fuse,
     )
@@ -186,6 +195,7 @@ def syscall_test(
         use_tmpfs = False,
         add_uds_tree = add_uds_tree,
         tags = list(tags),
+        debug = debug,
     )
 
     for (platform, platform_tags) in platforms.items():
@@ -197,9 +207,9 @@ def syscall_test(
             use_tmpfs = use_tmpfs,
             add_uds_tree = add_uds_tree,
             tags = platform_tags + tags,
+            debug = debug,
         )
 
-    # TODO(gvisor.dev/issue/1487): Enable VFS2 overlay tests.
     if add_overlay:
         _syscall_test(
             test = test,
@@ -209,7 +219,26 @@ def syscall_test(
             use_tmpfs = use_tmpfs,
             add_uds_tree = add_uds_tree,
             tags = platforms[default_platform] + tags,
+            debug = debug,
+            overlay = True,
+        )
+
+        # TODO(gvisor.dev/issue/4407): Remove tags to enable VFS2 overlay tests.
+        overlay_vfs2_tags = list(vfs2_tags)
+        overlay_vfs2_tags.append("manual")
+        overlay_vfs2_tags.append("noguitar")
+        overlay_vfs2_tags.append("notap")
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = default_platform,
+            use_tmpfs = use_tmpfs,
+            add_uds_tree = add_uds_tree,
+            tags = platforms[default_platform] + overlay_vfs2_tags,
+            debug = debug,
             overlay = True,
+            vfs2 = True,
         )
 
     if add_hostinet:
@@ -222,6 +251,7 @@ def syscall_test(
             network = "host",
             add_uds_tree = add_uds_tree,
             tags = platforms[default_platform] + tags,
+            debug = debug,
         )
 
     if not use_tmpfs:
@@ -234,6 +264,7 @@ def syscall_test(
             use_tmpfs = use_tmpfs,
             add_uds_tree = add_uds_tree,
             tags = platforms[default_platform] + tags,
+            debug = debug,
             file_access = "shared",
         )
         _syscall_test(
@@ -244,6 +275,7 @@ def syscall_test(
             use_tmpfs = use_tmpfs,
             add_uds_tree = add_uds_tree,
             tags = platforms[default_platform] + vfs2_tags,
+            debug = debug,
             file_access = "shared",
             vfs2 = True,
         )
diff --git a/test/runner/runner.go b/test/runner/runner.go
index 5ac91310d..7ab2c3edf 100644
--- a/test/runner/runner.go
+++ b/test/runner/runner.go
@@ -53,6 +53,9 @@ var (
 	runscPath  = flag.String("runsc", "", "path to runsc binary")
 
 	addUDSTree = flag.Bool("add-uds-tree", false, "expose a tree of UDS utilities for use in tests")
+	// TODO(gvisor.dev/issue/4572): properly support leak checking for runsc, and
+	// set to true as the default for the test runner.
+	leakCheck = flag.Bool("leak-check", false, "check for reference leaks")
 )
 
 // runTestCaseNative runs the test case directly on the host machine.
@@ -106,11 +109,14 @@ func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) {
 	cmd.Env = env
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
+	cmd.SysProcAttr = &syscall.SysProcAttr{}
+
+	if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) {
+		cmd.SysProcAttr.Cloneflags |= syscall.CLONE_NEWUTS
+	}
 
 	if specutils.HasCapabilities(capability.CAP_NET_ADMIN) {
-		cmd.SysProcAttr = &syscall.SysProcAttr{
-			Cloneflags: syscall.CLONE_NEWNET,
-		}
+		cmd.SysProcAttr.Cloneflags |= syscall.CLONE_NEWNET
 	}
 
 	if err := cmd.Run(); err != nil {
@@ -171,6 +177,9 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
 	if *addUDSTree {
 		args = append(args, "-fsgofer-host-uds")
 	}
+	if *leakCheck {
+		args = append(args, "-ref-leak-mode=log-names")
+	}
 
 	testLogDir := ""
 	if undeclaredOutputsDir, ok := syscall.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
diff --git a/test/runtimes/BUILD b/test/runtimes/BUILD
index 066338ee3..22b526f59 100644
--- a/test/runtimes/BUILD
+++ b/test/runtimes/BUILD
@@ -5,7 +5,7 @@ package(licenses = ["notice"])
 
 runtime_test(
     name = "go1.12",
-    exclude_file = "exclude_go1.12.csv",
+    exclude_file = "exclude/go1.12.csv",
     lang = "go",
     shard_count = 8,
 )
@@ -13,28 +13,28 @@ runtime_test(
 runtime_test(
     name = "java11",
     batch = 100,
-    exclude_file = "exclude_java11.csv",
+    exclude_file = "exclude/java11.csv",
     lang = "java",
     shard_count = 16,
 )
 
 runtime_test(
     name = "nodejs12.4.0",
-    exclude_file = "exclude_nodejs12.4.0.csv",
+    exclude_file = "exclude/nodejs12.4.0.csv",
     lang = "nodejs",
     shard_count = 8,
 )
 
 runtime_test(
     name = "php7.3.6",
-    exclude_file = "exclude_php7.3.6.csv",
+    exclude_file = "exclude/php7.3.6.csv",
     lang = "php",
     shard_count = 8,
 )
 
 runtime_test(
     name = "python3.7.3",
-    exclude_file = "exclude_python3.7.3.csv",
+    exclude_file = "exclude/python3.7.3.csv",
     lang = "python",
     shard_count = 8,
 )
diff --git a/test/runtimes/README.md b/test/runtimes/README.md
new file mode 100644
index 000000000..9dda1a728
--- /dev/null
+++ b/test/runtimes/README.md
@@ -0,0 +1,62 @@
+# gVisor Runtime Tests
+
+App Engine uses gvisor to sandbox application containers. The runtime tests aim
+to test `runsc` compatibility with these
+[standard runtimes](https://cloud.google.com/appengine/docs/standard/runtimes).
+The test itself runs the language-defined tests inside the sandboxed standard
+runtime container.
+
+Note: [Ruby runtime](https://cloud.google.com/appengine/docs/standard/ruby) is
+currently in beta mode and so we do not run tests for it yet.
+
+### Testing Locally
+
+To run runtime tests individually from a given runtime, use the following table.
+
+Language | Version | Download Image                              | Run Test(s)
+-------- | ------- | ------------------------------------------- | -----------
+Go       | 1.12    | `make -C images load-runtimes_go1.12`       | If the test name ends with `.go`, it is an on-disk test: <br> `docker run --runtime=runsc -it gvisor.dev/images/runtimes/go1.12 ( cd /usr/local/go/test ; go run run.go -v -- <TEST_NAME>... )` <br> Otherwise it is a tool test: <br> `docker run --runtime=runsc -it gvisor.dev/images/runtimes/go1.12 go tool dist test -v -no-rebuild ^TEST1$\|^TEST2$...`
+Java     | 11      | `make -C images load-runtimes_java11`       | `docker run --runtime=runsc -it gvisor.dev/images/runtimes/java11 jtreg -agentvm -dir:/root/test/jdk -noreport -timeoutFactor:20 -verbose:summary <TEST_NAME>...`
+NodeJS   | 12.4.0  | `make -C images load-runtimes_nodejs12.4.0` | `docker run --runtime=runsc -it gvisor.dev/images/runtimes/nodejs12.4.0 python tools/test.py --timeout=180 <TEST_NAME>...`
+Php      | 7.3.6   | `make -C images load-runtimes_php7.3.6`     | `docker run --runtime=runsc -it gvisor.dev/images/runtimes/php7.3.6 make test "TESTS=<TEST_NAME>..."`
+Python   | 3.7.3   | `make -C images load-runtimes_python3.7.3`  | `docker run --runtime=runsc -it gvisor.dev/images/runtimes/python3.7.3 ./python -m test <TEST_NAME>...`
+
+To run an entire runtime test locally, use the following table.
+
+Note: java runtime test take 1+ hours with 16 cores.
+
+Language | Version | Running the test suite
+-------- | ------- | ----------------------------------------
+Go       | 1.12    | `make go1.12-runtime-tests{_vfs2}`
+Java     | 11      | `make java11-runtime-tests{_vfs2}`
+NodeJS   | 12.4.0  | `make nodejs12.4.0-runtime-tests{_vfs2}`
+Php      | 7.3.6   | `make php7.3.6-runtime-tests{_vfs2}`
+Python   | 3.7.3   | `make python3.7.3-runtime-tests{_vfs2}`
+
+#### Clean Up
+
+Sometimes when runtime tests fail or when the testing container itself crashes
+unexpectedly, the containers are not removed or sometimes do not even exit. This
+can cause some docker commands like `docker system prune` to hang forever.
+
+Here are some helpful commands (should be executed in order):
+
+```bash
+docker ps -a  # Lists all docker processes; useful when investigating hanging containers.
+docker kill $(docker ps -a -q)  # Kills all running containers.
+docker rm $(docker ps -a -q)  # Removes all exited containers.
+docker system prune  # Remove unused data.
+```
+
+### Testing Infrastructure
+
+There are 3 components to this tests infrastructure:
+
+-   [`runner`](runner) - This is the test entrypoint. This is the binary is
+    invoked by `bazel test`. The runner spawns the target runtime container
+    using `runsc` and then copies over the `proctor` binary into the container.
+-   [`proctor`](proctor) - This binary acts as our agent inside the container
+    which communicates with the runner and actually executes tests.
+-   [`exclude`](exclude) - Holds a CSV file for each language runtime containing
+    the full path of tests that should be excluded from running along with a
+    reason for exclusion.
diff --git a/test/runtimes/exclude_go1.12.csv b/test/runtimes/exclude/go1.12.csv
index 81e02cf64..81e02cf64 100644
--- a/test/runtimes/exclude_go1.12.csv
+++ b/test/runtimes/exclude/go1.12.csv
diff --git a/test/runtimes/exclude_java11.csv b/test/runtimes/exclude/java11.csv
index 997a29cad..e41441374 100644
--- a/test/runtimes/exclude_java11.csv
+++ b/test/runtimes/exclude/java11.csv
@@ -1,9 +1,11 @@
 test name,bug id,comment
 com/sun/crypto/provider/Cipher/PBE/PKCS12Cipher.java,,Fails in Docker
+com/sun/jdi/InvokeHangTest.java,https://bugs.openjdk.java.net/browse/JDK-8218463,
 com/sun/jdi/NashornPopFrameTest.java,,
 com/sun/jdi/ProcessAttachTest.java,,
 com/sun/management/HotSpotDiagnosticMXBean/CheckOrigin.java,,Fails in Docker
 com/sun/management/OperatingSystemMXBean/GetCommittedVirtualMemorySize.java,,
+com/sun/management/ThreadMXBean/ThreadCpuTimeArray.java,,Test assumes high CPU clock precision
 com/sun/management/UnixOperatingSystemMXBean/GetMaxFileDescriptorCount.sh,,
 com/sun/tools/attach/AttachSelf.java,,
 com/sun/tools/attach/BasicTests.java,,
@@ -55,6 +57,7 @@ java/nio/channels/SocketChannel/SocketOptionTests.java,b/77965901,
 java/nio/channels/spi/SelectorProvider/inheritedChannel/InheritedChannelTest.java,,Fails in Docker
 java/rmi/activation/Activatable/extLoadedImpl/ext.sh,,
 java/rmi/transport/checkLeaseInfoLeak/CheckLeaseLeak.java,,
+java/security/cert/PolicyNode/GetPolicyQualifiers.java,b/170263154,Kokoro executor cert expired
 java/text/Format/NumberFormat/CurrencyFormat.java,,Fails in Docker
 java/text/Format/NumberFormat/CurrencyFormat.java,,Fails in Docker
 java/util/Calendar/JapaneseEraNameTest.java,,
@@ -141,6 +144,7 @@ jdk/jfr/cmd/TestSplit.java,,java.lang.RuntimeException: 'Missing file' missing f
 jdk/jfr/cmd/TestSummary.java,,java.lang.RuntimeException: 'Missing file' missing from stdout/stderr
 jdk/jfr/event/compiler/TestCompilerStats.java,,java.lang.RuntimeException: Field nmetodsSize not in event
 jdk/jfr/event/metadata/TestDefaultConfigurations.java,,Setting 'threshold' in event 'jdk.SecurityPropertyModification' was not configured in the configuration 'default'
+jdk/jfr/event/oldobject/TestLargeRootSet.java,,Flaky - `main' threw exception: java.lang.RuntimeException: Could not find root object
 jdk/jfr/event/runtime/TestActiveSettingEvent.java,,java.lang.Exception: Could not find setting with name jdk.X509Validation#threshold
 jdk/jfr/event/runtime/TestModuleEvents.java,,java.lang.RuntimeException: assertEquals: expected jdk.proxy1 to equal java.base
 jdk/jfr/event/runtime/TestNetworkUtilizationEvent.java,,
diff --git a/test/runtimes/exclude_nodejs12.4.0.csv b/test/runtimes/exclude/nodejs12.4.0.csv
index 1d8e65fd0..c4e7917ec 100644
--- a/test/runtimes/exclude_nodejs12.4.0.csv
+++ b/test/runtimes/exclude/nodejs12.4.0.csv
@@ -1,29 +1,22 @@
 test name,bug id,comment
-benchmark/test-benchmark-fs.js,,
-benchmark/test-benchmark-napi.js,,
+async-hooks/test-statwatcher.js,https://github.com/nodejs/node/issues/21425,Check for fix inclusion in nodejs releases after 2020-03-29
+benchmark/test-benchmark-fs.js,,Broken test
+benchmark/test-benchmark-napi.js,,Broken test
 doctool/test-make-doc.js,b/68848110,Expected to fail.
 internet/test-dgram-multicast-set-interface-lo.js,b/162798882,
-internet/test-doctool-versions.js,,
-internet/test-uv-threadpool-schedule.js,,
-parallel/test-cluster-dgram-reuse.js,b/64024294,
+internet/test-doctool-versions.js,,Broken test
+internet/test-uv-threadpool-schedule.js,,Broken test
 parallel/test-dgram-bind-fd.js,b/132447356,
 parallel/test-dgram-socket-buffer-size.js,b/68847921,
 parallel/test-dns-channel-timeout.js,b/161893056,
-parallel/test-fs-access.js,,
-parallel/test-fs-watchfile.js,,Flaky - File already exists error
-parallel/test-fs-write-stream.js,,Flaky
-parallel/test-fs-write-stream-throw-type-error.js,b/110226209,
-parallel/test-http-writable-true-after-close.js,,Flaky - Mismatched <anonymous> function calls. Expected exactly 1 actual 2
+parallel/test-fs-access.js,,Broken test
+parallel/test-fs-watchfile.js,b/166819807,Flaky - VFS1 only
+parallel/test-fs-write-stream.js,b/166819807,Flaky - VFS1 only
+parallel/test-fs-write-stream-double-close.js,b/166819807,Flaky - VFS1 only
+parallel/test-fs-write-stream-throw-type-error.js,b/166819807,Flaky - VFS1 only
+parallel/test-http-writable-true-after-close.js,b/171301436,Flaky - Mismatched <anonymous> function calls. Expected exactly 1 actual 2
 parallel/test-os.js,b/63997097,
-parallel/test-net-server-listen-options.js,,Flaky - EADDRINUSE
-parallel/test-process-uid-gid.js,,
-parallel/test-tls-cli-min-version-1.0.js,,Flaky - EADDRINUSE
-parallel/test-tls-cli-min-version-1.1.js,,Flaky - EADDRINUSE
-parallel/test-tls-cli-min-version-1.2.js,,Flaky - EADDRINUSE
-parallel/test-tls-cli-min-version-1.3.js,,Flaky - EADDRINUSE
-parallel/test-tls-cli-max-version-1.2.js,,Flaky - EADDRINUSE
-parallel/test-tls-cli-max-version-1.3.js,,Flaky - EADDRINUSE
-parallel/test-tls-min-max-version.js,,Flaky - EADDRINUSE
+parallel/test-process-uid-gid.js,,Does not work inside Docker with gid nobody
 pseudo-tty/test-assert-colors.js,b/162801321,
 pseudo-tty/test-assert-no-color.js,b/162801321,
 pseudo-tty/test-assert-position-indicator.js,b/162801321,
@@ -46,10 +39,7 @@ pseudo-tty/test-tty-stdout-resize.js,b/162801321,
 pseudo-tty/test-tty-stream-constructors.js,b/162801321,
 pseudo-tty/test-tty-window-size.js,b/162801321,
 pseudo-tty/test-tty-wrap.js,b/162801321,
-pummel/test-heapdump-http2.js,,Flaky
-pummel/test-net-pingpong.js,,
+pummel/test-net-pingpong.js,,Broken test
 pummel/test-vm-memleak.js,b/162799436,
-sequential/test-child-process-pass-fd.js,b/63926391,Flaky
-sequential/test-https-connect-localport.js,,Flaky - EADDRINUSE
-sequential/test-net-bytes-per-incoming-chunk-overhead.js,,flaky - timeout
-tick-processor/test-tick-processor-builtin.js,,
+pummel/test-watch-file.js,,Flaky - VFS1 only
+tick-processor/test-tick-processor-builtin.js,,Broken test
diff --git a/test/runtimes/exclude_php7.3.6.csv b/test/runtimes/exclude/php7.3.6.csv
index 2ce979dc8..9e1f4c050 100644
--- a/test/runtimes/exclude_php7.3.6.csv
+++ b/test/runtimes/exclude/php7.3.6.csv
@@ -13,18 +13,27 @@ ext/session/tests/session_set_save_handler_class_018.phpt,,
 ext/session/tests/session_set_save_handler_iface_003.phpt,,
 ext/session/tests/session_set_save_handler_sid_001.phpt,,
 ext/session/tests/session_set_save_handler_variation4.phpt,,
+ext/standard/tests/file/disk.phpt,https://bugs.php.net/bug.php?id=80018,
+ext/standard/tests/file/disk_free_space_basic.phpt,https://bugs.php.net/bug.php?id=80018,
+ext/standard/tests/file/disk_free_space_error.phpt,https://bugs.php.net/bug.php?id=80018,
+ext/standard/tests/file/disk_free_space_variation.phpt,https://bugs.php.net/bug.php?id=80018,
+ext/standard/tests/file/disk_total_space_basic.phpt,https://bugs.php.net/bug.php?id=80018,
+ext/standard/tests/file/disk_total_space_error.phpt,https://bugs.php.net/bug.php?id=80018,
+ext/standard/tests/file/disk_total_space_variation.phpt,https://bugs.php.net/bug.php?id=80018,
 ext/standard/tests/file/fopen_variation19.phpt,b/162894964,
 ext/standard/tests/file/lstat_stat_variation14.phpt,,Flaky
 ext/standard/tests/file/php_fd_wrapper_01.phpt,,
 ext/standard/tests/file/php_fd_wrapper_02.phpt,,
 ext/standard/tests/file/php_fd_wrapper_03.phpt,,
 ext/standard/tests/file/php_fd_wrapper_04.phpt,,
-ext/standard/tests/file/realpath_bug77484.phpt,b/162894969,
+ext/standard/tests/file/realpath_bug77484.phpt,b/162894969,VFS1 only failure
 ext/standard/tests/file/rename_variation.phpt,b/68717309,
 ext/standard/tests/file/symlink_link_linkinfo_is_link_variation4.phpt,b/162895341,
 ext/standard/tests/file/symlink_link_linkinfo_is_link_variation8.phpt,b/162896223,
 ext/standard/tests/general_functions/escapeshellarg_bug71270.phpt,,
 ext/standard/tests/general_functions/escapeshellcmd_bug71270.phpt,,
+ext/standard/tests/network/bug20134.phpt,b/171347929,Flaky
+ext/standard/tests/streams/proc_open_bug60120.phpt,,Flaky until php-src 3852a35fdbcb
 ext/standard/tests/streams/proc_open_bug69900.phpt,,Flaky
 ext/standard/tests/streams/stream_socket_sendto.phpt,,
 ext/standard/tests/strings/007.phpt,,
diff --git a/test/runtimes/exclude_python3.7.3.csv b/test/runtimes/exclude/python3.7.3.csv
index 8760f8951..911f22855 100644
--- a/test/runtimes/exclude_python3.7.3.csv
+++ b/test/runtimes/exclude/python3.7.3.csv
@@ -18,4 +18,3 @@ test_selectors,b/76116849,OSError not raised with epoll
 test_smtplib,b/162980434,unclosed sockets
 test_signal,,Flaky - signal: alarm clock
 test_socket,b/75983380,
-test_subprocess,b/162980831,
diff --git a/test/runtimes/proctor/BUILD b/test/runtimes/proctor/BUILD
index f76e2ddc0..fdc6d3173 100644
--- a/test/runtimes/proctor/BUILD
+++ b/test/runtimes/proctor/BUILD
@@ -1,28 +1,11 @@
-load("//tools:defs.bzl", "go_binary", "go_test")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
 go_binary(
     name = "proctor",
-    srcs = [
-        "go.go",
-        "java.go",
-        "nodejs.go",
-        "php.go",
-        "proctor.go",
-        "python.go",
-    ],
+    srcs = ["main.go"],
     pure = True,
     visibility = ["//test/runtimes:__pkg__"],
-)
-
-go_test(
-    name = "proctor_test",
-    size = "small",
-    srcs = ["proctor_test.go"],
-    library = ":proctor",
-    pure = True,
-    deps = [
-        "//pkg/test/testutil",
-    ],
+    deps = ["//test/runtimes/proctor/lib"],
 )
diff --git a/test/runtimes/proctor/lib/BUILD b/test/runtimes/proctor/lib/BUILD
new file mode 100644
index 000000000..0c8367dfe
--- /dev/null
+++ b/test/runtimes/proctor/lib/BUILD
@@ -0,0 +1,24 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "lib",
+    srcs = [
+        "go.go",
+        "java.go",
+        "lib.go",
+        "nodejs.go",
+        "php.go",
+        "python.go",
+    ],
+    visibility = ["//test/runtimes/proctor:__pkg__"],
+)
+
+go_test(
+    name = "lib_test",
+    size = "small",
+    srcs = ["lib_test.go"],
+    library = ":lib",
+    deps = ["//pkg/test/testutil"],
+)
diff --git a/test/runtimes/proctor/go.go b/test/runtimes/proctor/lib/go.go
index d0ae844e6..5c48fb60b 100644
--- a/test/runtimes/proctor/go.go
+++ b/test/runtimes/proctor/lib/go.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package main
+package lib
 
 import (
 	"fmt"
@@ -59,7 +59,7 @@ func (goRunner) ListTests() ([]string, error) {
 	}
 
 	// Go tests on disk.
-	diskSlice, err := search(goTestDir, goTestRegEx)
+	diskSlice, err := Search(goTestDir, goTestRegEx)
 	if err != nil {
 		return nil, err
 	}
diff --git a/test/runtimes/proctor/java.go b/test/runtimes/proctor/lib/java.go
index d456fa681..3105011ff 100644
--- a/test/runtimes/proctor/java.go
+++ b/test/runtimes/proctor/lib/java.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package main
+package lib
 
 import (
 	"fmt"
diff --git a/test/runtimes/proctor/proctor.go b/test/runtimes/proctor/lib/lib.go
index 9e0642424..f2ba82498 100644
--- a/test/runtimes/proctor/proctor.go
+++ b/test/runtimes/proctor/lib/lib.go
@@ -12,20 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Binary proctor runs the test for a particular runtime. It is meant to be
-// included in Docker images for all runtime tests.
-package main
+// Package lib contains proctor functions.
+package lib
 
 import (
-	"flag"
 	"fmt"
-	"log"
 	"os"
 	"os/exec"
 	"os/signal"
 	"path/filepath"
 	"regexp"
-	"strings"
 	"syscall"
 )
 
@@ -42,66 +38,8 @@ type TestRunner interface {
 	TestCmds(tests []string) []*exec.Cmd
 }
 
-var (
-	runtime   = flag.String("runtime", "", "name of runtime")
-	list      = flag.Bool("list", false, "list all available tests")
-	testNames = flag.String("tests", "", "run a subset of the available tests")
-	pause     = flag.Bool("pause", false, "cause container to pause indefinitely, reaping any zombie children")
-)
-
-func main() {
-	flag.Parse()
-
-	if *pause {
-		pauseAndReap()
-		panic("pauseAndReap should never return")
-	}
-
-	if *runtime == "" {
-		log.Fatalf("runtime flag must be provided")
-	}
-
-	tr, err := testRunnerForRuntime(*runtime)
-	if err != nil {
-		log.Fatalf("%v", err)
-	}
-
-	// List tests.
-	if *list {
-		tests, err := tr.ListTests()
-		if err != nil {
-			log.Fatalf("failed to list tests: %v", err)
-		}
-		for _, test := range tests {
-			fmt.Println(test)
-		}
-		return
-	}
-
-	var tests []string
-	if *testNames == "" {
-		// Run every test.
-		tests, err = tr.ListTests()
-		if err != nil {
-			log.Fatalf("failed to get all tests: %v", err)
-		}
-	} else {
-		// Run subset of test.
-		tests = strings.Split(*testNames, ",")
-	}
-
-	// Run tests.
-	cmds := tr.TestCmds(tests)
-	for _, cmd := range cmds {
-		cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
-		if err := cmd.Run(); err != nil {
-			log.Fatalf("FAIL: %v", err)
-		}
-	}
-}
-
-// testRunnerForRuntime returns a new TestRunner for the given runtime.
-func testRunnerForRuntime(runtime string) (TestRunner, error) {
+// TestRunnerForRuntime returns a new TestRunner for the given runtime.
+func TestRunnerForRuntime(runtime string) (TestRunner, error) {
 	switch runtime {
 	case "go":
 		return goRunner{}, nil
@@ -117,8 +55,8 @@ func testRunnerForRuntime(runtime string) (TestRunner, error) {
 	return nil, fmt.Errorf("invalid runtime %q", runtime)
 }
 
-// pauseAndReap is like init. It runs forever and reaps any children.
-func pauseAndReap() {
+// PauseAndReap is like init. It runs forever and reaps any children.
+func PauseAndReap() {
 	// Get notified of any new children.
 	ch := make(chan os.Signal, 1)
 	signal.Notify(ch, syscall.SIGCHLD)
@@ -138,9 +76,9 @@ func pauseAndReap() {
 	}
 }
 
-// search is a helper function to find tests in the given directory that match
+// Search is a helper function to find tests in the given directory that match
 // the regex.
-func search(root string, testFilter *regexp.Regexp) ([]string, error) {
+func Search(root string, testFilter *regexp.Regexp) ([]string, error) {
 	var testSlice []string
 
 	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
diff --git a/test/runtimes/proctor/proctor_test.go b/test/runtimes/proctor/lib/lib_test.go
index 6ef2de085..1193d2e28 100644
--- a/test/runtimes/proctor/proctor_test.go
+++ b/test/runtimes/proctor/lib/lib_test.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package main
+package lib
 
 import (
 	"io/ioutil"
@@ -47,7 +47,7 @@ func TestSearchEmptyDir(t *testing.T) {
 	var want []string
 
 	testFilter := regexp.MustCompile(`^test-[^-].+\.tc$`)
-	got, err := search(td, testFilter)
+	got, err := Search(td, testFilter)
 	if err != nil {
 		t.Errorf("search error: %v", err)
 	}
@@ -116,7 +116,7 @@ func TestSearch(t *testing.T) {
 	}
 
 	testFilter := regexp.MustCompile(`^test-[^-].+\.tc$`)
-	got, err := search(td, testFilter)
+	got, err := Search(td, testFilter)
 	if err != nil {
 		t.Errorf("search error: %v", err)
 	}
diff --git a/test/runtimes/proctor/nodejs.go b/test/runtimes/proctor/lib/nodejs.go
index dead5af4f..320597aa5 100644
--- a/test/runtimes/proctor/nodejs.go
+++ b/test/runtimes/proctor/lib/nodejs.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package main
+package lib
 
 import (
 	"os/exec"
@@ -32,7 +32,7 @@ var _ TestRunner = nodejsRunner{}
 
 // ListTests implements TestRunner.ListTests.
 func (nodejsRunner) ListTests() ([]string, error) {
-	testSlice, err := search(nodejsTestDir, nodejsTestRegEx)
+	testSlice, err := Search(nodejsTestDir, nodejsTestRegEx)
 	if err != nil {
 		return nil, err
 	}
diff --git a/test/runtimes/proctor/php.go b/test/runtimes/proctor/lib/php.go
index 6a83d64e3..b67a60a97 100644
--- a/test/runtimes/proctor/php.go
+++ b/test/runtimes/proctor/lib/php.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package main
+package lib
 
 import (
 	"os/exec"
@@ -29,7 +29,7 @@ var _ TestRunner = phpRunner{}
 
 // ListTests implements TestRunner.ListTests.
 func (phpRunner) ListTests() ([]string, error) {
-	testSlice, err := search(".", phpTestRegEx)
+	testSlice, err := Search(".", phpTestRegEx)
 	if err != nil {
 		return nil, err
 	}
diff --git a/test/runtimes/proctor/python.go b/test/runtimes/proctor/lib/python.go
index 7c598801b..429bfd850 100644
--- a/test/runtimes/proctor/python.go
+++ b/test/runtimes/proctor/lib/python.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package main
+package lib
 
 import (
 	"fmt"
diff --git a/test/runtimes/proctor/main.go b/test/runtimes/proctor/main.go
new file mode 100644
index 000000000..81cb68381
--- /dev/null
+++ b/test/runtimes/proctor/main.go
@@ -0,0 +1,113 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary proctor runs the test for a particular runtime. It is meant to be
+// included in Docker images for all runtime tests.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"strings"
+	"syscall"
+
+	"gvisor.dev/gvisor/test/runtimes/proctor/lib"
+)
+
+var (
+	runtime   = flag.String("runtime", "", "name of runtime")
+	list      = flag.Bool("list", false, "list all available tests")
+	testNames = flag.String("tests", "", "run a subset of the available tests")
+	pause     = flag.Bool("pause", false, "cause container to pause indefinitely, reaping any zombie children")
+)
+
+// setNumFilesLimit changes the NOFILE soft rlimit if it is too high.
+func setNumFilesLimit() error {
+	// In docker containers, the default value of the NOFILE limit is
+	// 1048576. A few runtime tests (e.g. python:test_subprocess)
+	// enumerates all possible file descriptors and these tests can fail by
+	// timeout if the NOFILE limit is too high. On gVisor, syscalls are
+	// slower so these tests will need even more time to pass.
+	const nofile = 32768
+	rLimit := syscall.Rlimit{}
+	err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &rLimit)
+	if err != nil {
+		return fmt.Errorf("failed to get RLIMIT_NOFILE: %v", err)
+	}
+	if rLimit.Cur > nofile {
+		rLimit.Cur = nofile
+		err := syscall.Setrlimit(syscall.RLIMIT_NOFILE, &rLimit)
+		if err != nil {
+			return fmt.Errorf("failed to set RLIMIT_NOFILE: %v", err)
+		}
+	}
+	return nil
+}
+
+func main() {
+	flag.Parse()
+
+	if *pause {
+		lib.PauseAndReap()
+		panic("pauseAndReap should never return")
+	}
+
+	if *runtime == "" {
+		log.Fatalf("runtime flag must be provided")
+	}
+
+	tr, err := lib.TestRunnerForRuntime(*runtime)
+	if err != nil {
+		log.Fatalf("%v", err)
+	}
+
+	// List tests.
+	if *list {
+		tests, err := tr.ListTests()
+		if err != nil {
+			log.Fatalf("failed to list tests: %v", err)
+		}
+		for _, test := range tests {
+			fmt.Println(test)
+		}
+		return
+	}
+
+	var tests []string
+	if *testNames == "" {
+		// Run every test.
+		tests, err = tr.ListTests()
+		if err != nil {
+			log.Fatalf("failed to get all tests: %v", err)
+		}
+	} else {
+		// Run subset of test.
+		tests = strings.Split(*testNames, ",")
+	}
+
+	if err := setNumFilesLimit(); err != nil {
+		log.Fatalf("%v", err)
+	}
+
+	// Run tests.
+	cmds := tr.TestCmds(tests)
+	for _, cmd := range cmds {
+		cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
+		if err := cmd.Run(); err != nil {
+			log.Fatalf("FAIL: %v", err)
+		}
+	}
+}
diff --git a/test/runtimes/runner/BUILD b/test/runtimes/runner/BUILD
index dc0d5d5b4..70cc01594 100644
--- a/test/runtimes/runner/BUILD
+++ b/test/runtimes/runner/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_binary", "go_test")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
@@ -7,16 +7,5 @@ go_binary(
     testonly = 1,
     srcs = ["main.go"],
     visibility = ["//test/runtimes:__pkg__"],
-    deps = [
-        "//pkg/log",
-        "//pkg/test/dockerutil",
-        "//pkg/test/testutil",
-    ],
-)
-
-go_test(
-    name = "exclude_test",
-    size = "small",
-    srcs = ["exclude_test.go"],
-    library = ":runner",
+    deps = ["//test/runtimes/runner/lib"],
 )
diff --git a/test/runtimes/runner/lib/BUILD b/test/runtimes/runner/lib/BUILD
new file mode 100644
index 000000000..d308f41b0
--- /dev/null
+++ b/test/runtimes/runner/lib/BUILD
@@ -0,0 +1,22 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "lib",
+    testonly = 1,
+    srcs = ["lib.go"],
+    visibility = ["//test/runtimes/runner:__pkg__"],
+    deps = [
+        "//pkg/log",
+        "//pkg/test/dockerutil",
+        "//pkg/test/testutil",
+    ],
+)
+
+go_test(
+    name = "lib_test",
+    size = "small",
+    srcs = ["exclude_test.go"],
+    library = ":lib",
+)
diff --git a/test/runtimes/runner/exclude_test.go b/test/runtimes/runner/lib/exclude_test.go
index 67c2170c8..f996e895b 100644
--- a/test/runtimes/runner/exclude_test.go
+++ b/test/runtimes/runner/lib/exclude_test.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package main
+package lib
 
 import (
 	"flag"
@@ -20,6 +20,8 @@ import (
 	"testing"
 )
 
+var excludeFile = flag.String("exclude_file", "", "file to test (standard format)")
+
 func TestMain(m *testing.M) {
 	flag.Parse()
 	os.Exit(m.Run())
@@ -27,7 +29,7 @@ func TestMain(m *testing.M) {
 
 // Test that the exclude file parses without error.
 func TestExcludelist(t *testing.T) {
-	ex, err := getExcludes()
+	ex, err := getExcludes(*excludeFile)
 	if err != nil {
 		t.Fatalf("error parsing exclude file: %v", err)
 	}
diff --git a/test/runtimes/runner/lib/lib.go b/test/runtimes/runner/lib/lib.go
new file mode 100644
index 000000000..78285cb0e
--- /dev/null
+++ b/test/runtimes/runner/lib/lib.go
@@ -0,0 +1,185 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package lib provides utilities for runner.
+package lib
+
+import (
+	"context"
+	"encoding/csv"
+	"fmt"
+	"io"
+	"os"
+	"sort"
+	"strings"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+)
+
+// RunTests is a helper that is called by main. It exists so that we can run
+// defered functions before exiting. It returns an exit code that should be
+// passed to os.Exit.
+func RunTests(lang, image, excludeFile string, batchSize int, timeout time.Duration) int {
+	// Get tests to exclude..
+	excludes, err := getExcludes(excludeFile)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Error getting exclude list: %s\n", err.Error())
+		return 1
+	}
+
+	// Construct the shared docker instance.
+	ctx := context.Background()
+	d := dockerutil.MakeContainer(ctx, testutil.DefaultLogger(lang))
+	defer d.CleanUp(ctx)
+
+	if err := testutil.TouchShardStatusFile(); err != nil {
+		fmt.Fprintf(os.Stderr, "error touching status shard file: %v\n", err)
+		return 1
+	}
+
+	// Get a slice of tests to run. This will also start a single Docker
+	// container that will be used to run each test. The final test will
+	// stop the Docker container.
+	tests, err := getTests(ctx, d, lang, image, batchSize, timeout, excludes)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "%s\n", err.Error())
+		return 1
+	}
+
+	m := testing.MainStart(testDeps{}, tests, nil, nil)
+	return m.Run()
+}
+
+// getTests executes all tests as table tests.
+func getTests(ctx context.Context, d *dockerutil.Container, lang, image string, batchSize int, timeout time.Duration, excludes map[string]struct{}) ([]testing.InternalTest, error) {
+	// Start the container.
+	opts := dockerutil.RunOpts{
+		Image: fmt.Sprintf("runtimes/%s", image),
+	}
+	d.CopyFiles(&opts, "/proctor", "test/runtimes/proctor/proctor")
+	if err := d.Spawn(ctx, opts, "/proctor/proctor", "--pause"); err != nil {
+		return nil, fmt.Errorf("docker run failed: %v", err)
+	}
+
+	// Get a list of all tests in the image.
+	list, err := d.Exec(ctx, dockerutil.ExecOpts{}, "/proctor/proctor", "--runtime", lang, "--list")
+	if err != nil {
+		return nil, fmt.Errorf("docker exec failed: %v", err)
+	}
+
+	// Calculate a subset of tests to run corresponding to the current
+	// shard.
+	tests := strings.Fields(list)
+	sort.Strings(tests)
+	indices, err := testutil.TestIndicesForShard(len(tests))
+	if err != nil {
+		return nil, fmt.Errorf("TestsForShard() failed: %v", err)
+	}
+
+	var itests []testing.InternalTest
+	for i := 0; i < len(indices); i += batchSize {
+		var tcs []string
+		end := i + batchSize
+		if end > len(indices) {
+			end = len(indices)
+		}
+		for _, tc := range indices[i:end] {
+			// Add test if not excluded.
+			if _, ok := excludes[tests[tc]]; ok {
+				log.Infof("Skipping test case %s\n", tests[tc])
+				continue
+			}
+			tcs = append(tcs, tests[tc])
+		}
+		itests = append(itests, testing.InternalTest{
+			Name: strings.Join(tcs, ", "),
+			F: func(t *testing.T) {
+				var (
+					now    = time.Now()
+					done   = make(chan struct{})
+					output string
+					err    error
+				)
+
+				go func() {
+					fmt.Printf("RUNNING the following in a batch\n%s\n", strings.Join(tcs, "\n"))
+					output, err = d.Exec(ctx, dockerutil.ExecOpts{}, "/proctor/proctor", "--runtime", lang, "--tests", strings.Join(tcs, ","))
+					close(done)
+				}()
+
+				select {
+				case <-done:
+					if err == nil {
+						fmt.Printf("PASS: (%v)\n\n", time.Since(now))
+						return
+					}
+					t.Errorf("FAIL: (%v):\n%s\n", time.Since(now), output)
+				case <-time.After(timeout):
+					t.Errorf("TIMEOUT: (%v):\n%s\n", time.Since(now), output)
+				}
+			},
+		})
+	}
+
+	return itests, nil
+}
+
+// getBlacklist reads the exclude file and returns a set of test names to
+// exclude.
+func getExcludes(excludeFile string) (map[string]struct{}, error) {
+	excludes := make(map[string]struct{})
+	if excludeFile == "" {
+		return excludes, nil
+	}
+	f, err := os.Open(excludeFile)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	r := csv.NewReader(f)
+
+	// First line is header. Skip it.
+	if _, err := r.Read(); err != nil {
+		return nil, err
+	}
+
+	for {
+		record, err := r.Read()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return nil, err
+		}
+		excludes[record[0]] = struct{}{}
+	}
+	return excludes, nil
+}
+
+// testDeps implements testing.testDeps (an unexported interface), and is
+// required to use testing.MainStart.
+type testDeps struct{}
+
+func (f testDeps) MatchString(a, b string) (bool, error)       { return a == b, nil }
+func (f testDeps) StartCPUProfile(io.Writer) error             { return nil }
+func (f testDeps) StopCPUProfile()                             {}
+func (f testDeps) WriteProfileTo(string, io.Writer, int) error { return nil }
+func (f testDeps) ImportPath() string                          { return "" }
+func (f testDeps) StartTestLog(io.Writer)                      {}
+func (f testDeps) StopTestLog() error                          { return nil }
diff --git a/test/runtimes/runner/main.go b/test/runtimes/runner/main.go
index 948e7cf9c..ec79a22c2 100644
--- a/test/runtimes/runner/main.go
+++ b/test/runtimes/runner/main.go
@@ -16,20 +16,12 @@
 package main
 
 import (
-	"context"
-	"encoding/csv"
 	"flag"
 	"fmt"
-	"io"
 	"os"
-	"sort"
-	"strings"
-	"testing"
 	"time"
 
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/test/dockerutil"
-	"gvisor.dev/gvisor/pkg/test/testutil"
+	"gvisor.dev/gvisor/test/runtimes/runner/lib"
 )
 
 var (
@@ -37,169 +29,14 @@ var (
 	image       = flag.String("image", "", "docker image with runtime tests")
 	excludeFile = flag.String("exclude_file", "", "file containing list of tests to exclude, in CSV format with fields: test name, bug id, comment")
 	batchSize   = flag.Int("batch", 50, "number of test cases run in one command")
+	timeout     = flag.Duration("timeout", 90*time.Minute, "batch timeout")
 )
 
-// Wait time for each test to run.
-const timeout = 90 * time.Minute
-
 func main() {
 	flag.Parse()
 	if *lang == "" || *image == "" {
 		fmt.Fprintf(os.Stderr, "lang and image flags must not be empty\n")
 		os.Exit(1)
 	}
-	os.Exit(runTests())
-}
-
-// runTests is a helper that is called by main. It exists so that we can run
-// defered functions before exiting. It returns an exit code that should be
-// passed to os.Exit.
-func runTests() int {
-	// Get tests to exclude..
-	excludes, err := getExcludes()
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "Error getting exclude list: %s\n", err.Error())
-		return 1
-	}
-
-	// Construct the shared docker instance.
-	ctx := context.Background()
-	d := dockerutil.MakeContainer(ctx, testutil.DefaultLogger(*lang))
-	defer d.CleanUp(ctx)
-
-	if err := testutil.TouchShardStatusFile(); err != nil {
-		fmt.Fprintf(os.Stderr, "error touching status shard file: %v\n", err)
-		return 1
-	}
-
-	// Get a slice of tests to run. This will also start a single Docker
-	// container that will be used to run each test. The final test will
-	// stop the Docker container.
-	tests, err := getTests(ctx, d, excludes)
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "%s\n", err.Error())
-		return 1
-	}
-
-	m := testing.MainStart(testDeps{}, tests, nil, nil)
-	return m.Run()
-}
-
-// getTests executes all tests as table tests.
-func getTests(ctx context.Context, d *dockerutil.Container, excludes map[string]struct{}) ([]testing.InternalTest, error) {
-	// Start the container.
-	opts := dockerutil.RunOpts{
-		Image: fmt.Sprintf("runtimes/%s", *image),
-	}
-	d.CopyFiles(&opts, "/proctor", "test/runtimes/proctor/proctor")
-	if err := d.Spawn(ctx, opts, "/proctor/proctor", "--pause"); err != nil {
-		return nil, fmt.Errorf("docker run failed: %v", err)
-	}
-
-	// Get a list of all tests in the image.
-	list, err := d.Exec(ctx, dockerutil.ExecOpts{}, "/proctor/proctor", "--runtime", *lang, "--list")
-	if err != nil {
-		return nil, fmt.Errorf("docker exec failed: %v", err)
-	}
-
-	// Calculate a subset of tests to run corresponding to the current
-	// shard.
-	tests := strings.Fields(list)
-	sort.Strings(tests)
-	indices, err := testutil.TestIndicesForShard(len(tests))
-	if err != nil {
-		return nil, fmt.Errorf("TestsForShard() failed: %v", err)
-	}
-
-	var itests []testing.InternalTest
-	for i := 0; i < len(indices); i += *batchSize {
-		var tcs []string
-		end := i + *batchSize
-		if end > len(indices) {
-			end = len(indices)
-		}
-		for _, tc := range indices[i:end] {
-			// Add test if not excluded.
-			if _, ok := excludes[tests[tc]]; ok {
-				log.Infof("Skipping test case %s\n", tests[tc])
-				continue
-			}
-			tcs = append(tcs, tests[tc])
-		}
-		itests = append(itests, testing.InternalTest{
-			Name: strings.Join(tcs, ", "),
-			F: func(t *testing.T) {
-				var (
-					now    = time.Now()
-					done   = make(chan struct{})
-					output string
-					err    error
-				)
-
-				go func() {
-					fmt.Printf("RUNNING the following in a batch\n%s\n", strings.Join(tcs, "\n"))
-					output, err = d.Exec(ctx, dockerutil.ExecOpts{}, "/proctor/proctor", "--runtime", *lang, "--tests", strings.Join(tcs, ","))
-					close(done)
-				}()
-
-				select {
-				case <-done:
-					if err == nil {
-						fmt.Printf("PASS: (%v)\n\n", time.Since(now))
-						return
-					}
-					t.Errorf("FAIL: (%v):\n%s\n", time.Since(now), output)
-				case <-time.After(timeout):
-					t.Errorf("TIMEOUT: (%v):\n%s\n", time.Since(now), output)
-				}
-			},
-		})
-	}
-
-	return itests, nil
+	os.Exit(lib.RunTests(*lang, *image, *excludeFile, *batchSize, *timeout))
 }
-
-// getBlacklist reads the exclude file and returns a set of test names to
-// exclude.
-func getExcludes() (map[string]struct{}, error) {
-	excludes := make(map[string]struct{})
-	if *excludeFile == "" {
-		return excludes, nil
-	}
-	f, err := os.Open(*excludeFile)
-	if err != nil {
-		return nil, err
-	}
-	defer f.Close()
-
-	r := csv.NewReader(f)
-
-	// First line is header. Skip it.
-	if _, err := r.Read(); err != nil {
-		return nil, err
-	}
-
-	for {
-		record, err := r.Read()
-		if err == io.EOF {
-			break
-		}
-		if err != nil {
-			return nil, err
-		}
-		excludes[record[0]] = struct{}{}
-	}
-	return excludes, nil
-}
-
-// testDeps implements testing.testDeps (an unexported interface), and is
-// required to use testing.MainStart.
-type testDeps struct{}
-
-func (f testDeps) MatchString(a, b string) (bool, error)       { return a == b, nil }
-func (f testDeps) StartCPUProfile(io.Writer) error             { return nil }
-func (f testDeps) StopCPUProfile()                             {}
-func (f testDeps) WriteProfileTo(string, io.Writer, int) error { return nil }
-func (f testDeps) ImportPath() string                          { return "" }
-func (f testDeps) StartTestLog(io.Writer)                      {}
-func (f testDeps) StopTestLog() error                          { return nil }
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 0eadc6b08..f66a9ceb4 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -64,6 +64,8 @@ syscall_test(
 
 syscall_test(
     size = "large",
+    # Produce too many logs in the debug mode.
+    debug = False,
     shard_count = 50,
     # Takes too long for TSAN. Since this is kind of a stress test that doesn't
     # involve much concurrency, TSAN's usefulness here is limited anyway.
@@ -236,7 +238,7 @@ syscall_test(
 
 syscall_test(
     size = "medium",
-    add_overlay = False,  # TODO(gvisor.dev/issue/317): enable when fixed.
+    add_overlay = True,
     test = "//test/syscalls/linux:inotify_test",
 )
 
@@ -251,12 +253,20 @@ syscall_test(
 )
 
 syscall_test(
+    test = "//test/syscalls/linux:ip6tables_test",
+)
+
+syscall_test(
     size = "large",
     shard_count = 5,
     test = "//test/syscalls/linux:itimer_test",
 )
 
 syscall_test(
+    test = "//test/syscalls/linux:kcov_test",
+)
+
+syscall_test(
     test = "//test/syscalls/linux:kill_test",
 )
 
@@ -276,6 +286,10 @@ syscall_test(
 )
 
 syscall_test(
+    test = "//test/syscalls/linux:membarrier_test",
+)
+
+syscall_test(
     test = "//test/syscalls/linux:memory_accounting_test",
 )
 
@@ -662,6 +676,21 @@ syscall_test(
 )
 
 syscall_test(
+    size = "medium",
+    # Takes too long under gotsan to run.
+    tags = ["nogotsan"],
+    test = "//test/syscalls/linux:socket_ipv4_udp_unbound_loopback_nogotsan_test",
+)
+
+syscall_test(
+    test = "//test/syscalls/linux:socket_ipv4_udp_unbound_loopback_netlink_test",
+)
+
+syscall_test(
+    test = "//test/syscalls/linux:socket_ipv6_udp_unbound_loopback_netlink_test",
+)
+
+syscall_test(
     test = "//test/syscalls/linux:socket_ip_unbound_test",
 )
 
@@ -778,6 +807,7 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:statfs_test",
+    use_tmpfs = True,  # Test specifically relies on TEST_TMPDIR to be tmpfs.
 )
 
 syscall_test(
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 6299870bc..c94c1d5bd 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -22,6 +22,7 @@ exports_files(
         "socket_ipv4_tcp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
+        "socket_ipv4_udp_unbound_loopback_nogotsan.cc",
         "tcp_socket.cc",
         "udp_bind.cc",
         "udp_socket.cc",
@@ -1030,6 +1031,24 @@ cc_binary(
 )
 
 cc_binary(
+    name = "ip6tables_test",
+    testonly = 1,
+    srcs = [
+        "ip6tables.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":iptables_types",
+        ":socket_test_util",
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
     name = "itimer_test",
     testonly = 1,
     srcs = ["itimer.cc"],
@@ -1050,6 +1069,21 @@ cc_binary(
 )
 
 cc_binary(
+    name = "kcov_test",
+    testonly = 1,
+    srcs = ["kcov.cc"],
+    linkstatic = 1,
+    deps = [
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+    ],
+)
+
+cc_binary(
     name = "kill_test",
     testonly = 1,
     srcs = ["kill.cc"],
@@ -1122,6 +1156,24 @@ cc_binary(
 )
 
 cc_binary(
+    name = "membarrier_test",
+    testonly = 1,
+    srcs = ["membarrier.cc"],
+    linkstatic = 1,
+    deps = [
+        "@com_google_absl//absl/time",
+        gtest,
+        "//test/util:cleanup",
+        "//test/util:logging",
+        "//test/util:memory_util",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+    ],
+)
+
+cc_binary(
     name = "mempolicy_test",
     testonly = 1,
     srcs = ["mempolicy.cc"],
@@ -1233,6 +1285,7 @@ cc_binary(
         "//test/util:mount_util",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
+        "//test/util:save_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
@@ -1634,12 +1687,14 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/container:node_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/time",
         gtest,
         "//test/util:memory_util",
         "//test/util:posix_error",
+        "//test/util:proc_util",
         "//test/util:temp_path",
         "//test/util:test_util",
         "//test/util:thread_util",
@@ -1954,6 +2009,7 @@ cc_binary(
         gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
+        "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
     ],
@@ -2378,12 +2434,51 @@ cc_library(
         ":socket_test_util",
         "@com_google_absl//absl/memory",
         gtest,
+        "//test/util:posix_error",
+        "//test/util:save_util",
         "//test/util:test_util",
     ],
     alwayslink = 1,
 )
 
 cc_library(
+    name = "socket_ipv4_udp_unbound_netlink_test_cases",
+    testonly = 1,
+    srcs = [
+        "socket_ipv4_udp_unbound_netlink.cc",
+    ],
+    hdrs = [
+        "socket_ipv4_udp_unbound_netlink.h",
+    ],
+    deps = [
+        ":socket_netlink_route_util",
+        ":socket_test_util",
+        "//test/util:capability_util",
+        "//test/util:cleanup",
+        gtest,
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "socket_ipv6_udp_unbound_netlink_test_cases",
+    testonly = 1,
+    srcs = [
+        "socket_ipv6_udp_unbound_netlink.cc",
+    ],
+    hdrs = [
+        "socket_ipv6_udp_unbound_netlink.h",
+    ],
+    deps = [
+        ":socket_netlink_route_util",
+        ":socket_test_util",
+        "//test/util:capability_util",
+        gtest,
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
     name = "socket_ipv4_udp_unbound_external_networking_test_cases",
     testonly = 1,
     srcs = [
@@ -2720,6 +2815,55 @@ cc_binary(
 )
 
 cc_binary(
+    name = "socket_ipv4_udp_unbound_loopback_nogotsan_test",
+    testonly = 1,
+    srcs = [
+        "socket_ipv4_udp_unbound_loopback_nogotsan.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_test_util",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+cc_binary(
+    name = "socket_ipv4_udp_unbound_loopback_netlink_test",
+    testonly = 1,
+    srcs = [
+        "socket_ipv4_udp_unbound_loopback_netlink.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_ipv4_udp_unbound_netlink_test_cases",
+        ":socket_test_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "socket_ipv6_udp_unbound_loopback_netlink_test",
+    testonly = 1,
+    srcs = [
+        "socket_ipv6_udp_unbound_loopback_netlink.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_ipv6_udp_unbound_netlink_test_cases",
+        ":socket_test_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
     name = "socket_ip_unbound_test",
     testonly = 1,
     srcs = [
@@ -3299,6 +3443,7 @@ cc_binary(
         "@com_google_absl//absl/strings",
         gtest,
         "//test/util:posix_error",
+        "//test/util:save_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
@@ -3482,6 +3627,7 @@ cc_binary(
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
+        "//test/util:timer_util",
     ],
 )
 
@@ -3548,15 +3694,12 @@ cc_binary(
     ],
 )
 
-cc_library(
-    name = "udp_socket_test_cases",
+cc_binary(
+    name = "udp_socket_test",
     testonly = 1,
-    srcs = [
-        "udp_socket_errqueue_test_case.cc",
-        "udp_socket_test_cases.cc",
-    ],
-    hdrs = ["udp_socket_test_cases.h"],
+    srcs = ["udp_socket.cc"],
     defines = select_system(),
+    linkstatic = 1,
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
@@ -3571,17 +3714,6 @@ cc_library(
         "//test/util:test_util",
         "//test/util:thread_util",
     ],
-    alwayslink = 1,
-)
-
-cc_binary(
-    name = "udp_socket_test",
-    testonly = 1,
-    srcs = ["udp_socket.cc"],
-    linkstatic = 1,
-    deps = [
-        ":udp_socket_test_cases",
-    ],
 )
 
 cc_binary(
diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc
index 18d2f22c1..3797fd4c8 100644
--- a/test/syscalls/linux/exec_binary.cc
+++ b/test/syscalls/linux/exec_binary.cc
@@ -1042,6 +1042,13 @@ class ElfInterpreterStaticTest
 
 // Statically linked ELF with a statically linked ELF interpreter.
 TEST_P(ElfInterpreterStaticTest, Test) {
+  // TODO(gvisor.dev/issue/3721): Test has been observed to segfault on 5.X
+  // kernels.
+  if (!IsRunningOnGvisor()) {
+    auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion());
+    SKIP_IF(version.major > 4);
+  }
+
   const std::vector<char> segment_suffix = std::get<0>(GetParam());
   const int expected_errno = std::get<1>(GetParam());
 
diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc
index cabc2b751..edd23e063 100644
--- a/test/syscalls/linux/fallocate.cc
+++ b/test/syscalls/linux/fallocate.cc
@@ -179,6 +179,12 @@ TEST_F(AllocateTest, FallocateOtherFDs) {
   auto sock0 = FileDescriptor(socks[0]);
   auto sock1 = FileDescriptor(socks[1]);
   EXPECT_THAT(fallocate(sock0.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV));
+
+  int pipefds[2];
+  ASSERT_THAT(pipe(pipefds), SyscallSucceeds());
+  EXPECT_THAT(fallocate(pipefds[1], 0, 0, 10), SyscallFailsWithErrno(ESPIPE));
+  close(pipefds[0]);
+  close(pipefds[1]);
 }
 
 }  // namespace
diff --git a/test/syscalls/linux/flock.cc b/test/syscalls/linux/flock.cc
index 549141cbb..b286e84fe 100644
--- a/test/syscalls/linux/flock.cc
+++ b/test/syscalls/linux/flock.cc
@@ -216,14 +216,29 @@ TEST_F(FlockTest, TestSharedLockFailExclusiveHolderBlocking_NoRandomSave) {
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
 
-  // Register a signal handler for SIGALRM and set an alarm that will go off
-  // while blocking in the subsequent flock() call. This will interrupt flock()
-  // and cause it to return EINTR.
+  // Make sure that a blocking flock() call will return EINTR when interrupted
+  // by a signal. Create a timer that will go off while blocking on flock(), and
+  // register the corresponding signal handler.
+  auto timer = ASSERT_NO_ERRNO_AND_VALUE(
+      TimerCreate(CLOCK_MONOTONIC, sigevent_t{
+                                       .sigev_signo = SIGALRM,
+                                       .sigev_notify = SIGEV_SIGNAL,
+                                   }));
+
   struct sigaction act = {};
   act.sa_handler = trivial_handler;
   ASSERT_THAT(sigaction(SIGALRM, &act, NULL), SyscallSucceeds());
-  ASSERT_THAT(ualarm(10000, 0), SyscallSucceeds());
+
+  // Now that the signal handler is registered, set the timer. Set an interval
+  // so that it's ok if the timer goes off before we call flock.
+  ASSERT_NO_ERRNO(
+      timer.Set(0, itimerspec{
+                       .it_interval = absl::ToTimespec(absl::Milliseconds(10)),
+                       .it_value = absl::ToTimespec(absl::Milliseconds(10)),
+                   }));
+
   ASSERT_THAT(flock(fd.get(), LOCK_SH), SyscallFailsWithErrno(EINTR));
+  timer.reset();
 
   // Unlock
   ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
@@ -258,14 +273,29 @@ TEST_F(FlockTest, TestExclusiveLockFailExclusiveHolderBlocking_NoRandomSave) {
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
 
-  // Register a signal handler for SIGALRM and set an alarm that will go off
-  // while blocking in the subsequent flock() call. This will interrupt flock()
-  // and cause it to return EINTR.
+  // Make sure that a blocking flock() call will return EINTR when interrupted
+  // by a signal. Create a timer that will go off while blocking on flock(), and
+  // register the corresponding signal handler.
+  auto timer = ASSERT_NO_ERRNO_AND_VALUE(
+      TimerCreate(CLOCK_MONOTONIC, sigevent_t{
+                                       .sigev_signo = SIGALRM,
+                                       .sigev_notify = SIGEV_SIGNAL,
+                                   }));
+
   struct sigaction act = {};
   act.sa_handler = trivial_handler;
   ASSERT_THAT(sigaction(SIGALRM, &act, NULL), SyscallSucceeds());
-  ASSERT_THAT(ualarm(10000, 0), SyscallSucceeds());
+
+  // Now that the signal handler is registered, set the timer. Set an interval
+  // so that it's ok if the timer goes off before we call flock.
+  ASSERT_NO_ERRNO(
+      timer.Set(0, itimerspec{
+                       .it_interval = absl::ToTimespec(absl::Milliseconds(10)),
+                       .it_value = absl::ToTimespec(absl::Milliseconds(10)),
+                   }));
+
   ASSERT_THAT(flock(fd.get(), LOCK_EX), SyscallFailsWithErrno(EINTR));
+  timer.reset();
 
   // Unlock
   ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index 5cb325a9e..e4392a450 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -465,7 +465,9 @@ TEST(Inotify, ConcurrentFileDeletionAndWatchRemoval) {
     for (int i = 0; i < 100; ++i) {
       FileDescriptor file_fd =
           ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_CREAT, S_IRUSR | S_IWUSR));
-      file_fd.reset();  // Close before unlinking (although save is disabled).
+      // Close before unlinking (although S/R is disabled). Some filesystems
+      // cannot restore an open fd on an unlinked file.
+      file_fd.reset();
       EXPECT_THAT(unlink(filename.c_str()), SyscallSucceeds());
     }
   };
@@ -1256,10 +1258,7 @@ TEST(Inotify, MknodGeneratesCreateEvent) {
       InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
 
   const TempPath file1(root.path() + "/file1");
-  const int rc = mknod(file1.path().c_str(), S_IFREG, 0);
-  // mknod(2) is only supported on tmpfs in the sandbox.
-  SKIP_IF(IsRunningOnGvisor() && rc != 0);
-  ASSERT_THAT(rc, SyscallSucceeds());
+  ASSERT_THAT(mknod(file1.path().c_str(), S_IFREG, 0), SyscallSucceeds());
 
   const std::vector<Event> events =
       ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
@@ -1289,6 +1288,10 @@ TEST(Inotify, SymlinkGeneratesCreateEvent) {
 }
 
 TEST(Inotify, LinkGeneratesAttribAndCreateEvents) {
+  // Inotify does not work properly with hard links in gofer and overlay fs.
+  SKIP_IF(IsRunningOnGvisor() &&
+          !ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs(GetAbsoluteTestTmpdir())));
+
   const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const TempPath file1 =
       ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
@@ -1301,11 +1304,8 @@ TEST(Inotify, LinkGeneratesAttribAndCreateEvents) {
   const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE(
       InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
 
-  const int rc = link(file1.path().c_str(), link1.path().c_str());
-  // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
-  SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
-          (errno == EPERM || errno == ENOENT));
-  ASSERT_THAT(rc, SyscallSucceeds());
+  ASSERT_THAT(link(file1.path().c_str(), link1.path().c_str()),
+              SyscallSucceeds());
 
   const std::vector<Event> events =
       ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
@@ -1334,66 +1334,70 @@ TEST(Inotify, UtimesGeneratesAttribEvent) {
 }
 
 TEST(Inotify, HardlinksReuseSameWatch) {
+  // Inotify does not work properly with hard links in gofer and overlay fs.
+  SKIP_IF(IsRunningOnGvisor() &&
+          !ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs(GetAbsoluteTestTmpdir())));
+
   const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
-  TempPath file1 =
+  TempPath file =
       ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
-  TempPath link1(root.path() + "/link1");
-  const int rc = link(file1.path().c_str(), link1.path().c_str());
-  // link(2) is only supported on tmpfs in the sandbox.
-  SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
-          (errno == EPERM || errno == ENOENT));
-  ASSERT_THAT(rc, SyscallSucceeds());
+
+  TempPath file2(root.path() + "/file2");
+  ASSERT_THAT(link(file.path().c_str(), file2.path().c_str()),
+              SyscallSucceeds());
 
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
 
   const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
       InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
-  const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE(
-      InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
-  const int link1_wd = ASSERT_NO_ERRNO_AND_VALUE(
-      InotifyAddWatch(fd.get(), link1.path(), IN_ALL_EVENTS));
+  const int file_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), file.path(), IN_ALL_EVENTS));
+  const int file2_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), file2.path(), IN_ALL_EVENTS));
 
   // The watch descriptors for watches on different links to the same file
   // should be identical.
-  EXPECT_NE(root_wd, file1_wd);
-  EXPECT_EQ(file1_wd, link1_wd);
+  EXPECT_NE(root_wd, file_wd);
+  EXPECT_EQ(file_wd, file2_wd);
 
-  FileDescriptor file1_fd =
-      ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY));
+  FileDescriptor file_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY));
 
   std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
   ASSERT_THAT(events,
-              AreUnordered({Event(IN_OPEN, root_wd, Basename(file1.path())),
-                            Event(IN_OPEN, file1_wd)}));
+              AreUnordered({Event(IN_OPEN, root_wd, Basename(file.path())),
+                            Event(IN_OPEN, file_wd)}));
 
   // For the next step, we want to ensure all fds to the file are closed. Do
   // that now and drain the resulting events.
-  file1_fd.reset();
+  file_fd.reset();
   events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
-  ASSERT_THAT(events,
-              Are({Event(IN_CLOSE_WRITE, root_wd, Basename(file1.path())),
-                   Event(IN_CLOSE_WRITE, file1_wd)}));
+  ASSERT_THAT(
+      events,
+      AreUnordered({Event(IN_CLOSE_WRITE, root_wd, Basename(file.path())),
+                    Event(IN_CLOSE_WRITE, file_wd)}));
 
   // Try removing the link and let's see what events show up. Note that after
   // this, we still have a link to the file so the watch shouldn't be
   // automatically removed.
-  const std::string link1_path = link1.reset();
+  const std::string file2_path = file2.reset();
 
   events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
-  ASSERT_THAT(events, Are({Event(IN_ATTRIB, link1_wd),
-                           Event(IN_DELETE, root_wd, Basename(link1_path))}));
+  ASSERT_THAT(events,
+              AreUnordered({Event(IN_ATTRIB, file2_wd),
+                            Event(IN_DELETE, root_wd, Basename(file2_path))}));
 
   // Now remove the other link. Since this is the last link to the file, the
   // watch should be automatically removed.
-  const std::string file1_path = file1.reset();
+  const std::string file_path = file.reset();
 
   events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
   ASSERT_THAT(
       events,
-      AreUnordered({Event(IN_ATTRIB, file1_wd), Event(IN_DELETE_SELF, file1_wd),
-                    Event(IN_IGNORED, file1_wd),
-                    Event(IN_DELETE, root_wd, Basename(file1_path))}));
+      AreUnordered({Event(IN_ATTRIB, file_wd), Event(IN_DELETE_SELF, file_wd),
+                    Event(IN_IGNORED, file_wd),
+                    Event(IN_DELETE, root_wd, Basename(file_path))}));
 }
 
 // Calling mkdir within "parent/child" should generate an event for child, but
@@ -1804,17 +1808,17 @@ TEST(Inotify, SpliceOnInotifyFD) {
 // Watches on a parent should not be triggered by actions on a hard link to one
 // of its children that has a different parent.
 TEST(Inotify, LinkOnOtherParent) {
+  // Inotify does not work properly with hard links in gofer and overlay fs.
+  SKIP_IF(IsRunningOnGvisor() &&
+          !ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs(GetAbsoluteTestTmpdir())));
+
   const TempPath dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const TempPath dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const TempPath file =
       ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path()));
   std::string link_path = NewTempAbsPathInDir(dir2.path());
 
-  const int rc = link(file.path().c_str(), link_path.c_str());
-  // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
-  SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
-          (errno == EPERM || errno == ENOENT));
-  ASSERT_THAT(rc, SyscallSucceeds());
+  ASSERT_THAT(link(file.path().c_str(), link_path.c_str()), SyscallSucceeds());
 
   const FileDescriptor inotify_fd =
       ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
@@ -1823,13 +1827,18 @@ TEST(Inotify, LinkOnOtherParent) {
 
   // Perform various actions on the link outside of dir1, which should trigger
   // no inotify events.
-  const FileDescriptor fd =
+  FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(link_path.c_str(), O_RDWR));
   int val = 0;
   ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
   ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
   ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds());
+
+  // Close before unlinking; some filesystems cannot restore an open fd on an
+  // unlinked file.
+  fd.reset();
   ASSERT_THAT(unlink(link_path.c_str()), SyscallSucceeds());
+
   const std::vector<Event> events =
       ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
   EXPECT_THAT(events, Are({}));
@@ -1934,14 +1943,22 @@ TEST(Inotify, IncludeUnlinkedFile_NoRandomSave) {
   ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
   std::vector<Event> events =
       ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
-  EXPECT_THAT(events, Are({
-                          Event(IN_ATTRIB, file_wd),
-                          Event(IN_DELETE, dir_wd, Basename(file.path())),
-                          Event(IN_ACCESS, dir_wd, Basename(file.path())),
-                          Event(IN_ACCESS, file_wd),
-                          Event(IN_MODIFY, dir_wd, Basename(file.path())),
-                          Event(IN_MODIFY, file_wd),
-                      }));
+  EXPECT_THAT(events, AnyOf(Are({
+                                Event(IN_ATTRIB, file_wd),
+                                Event(IN_DELETE, dir_wd, Basename(file.path())),
+                                Event(IN_ACCESS, dir_wd, Basename(file.path())),
+                                Event(IN_ACCESS, file_wd),
+                                Event(IN_MODIFY, dir_wd, Basename(file.path())),
+                                Event(IN_MODIFY, file_wd),
+                            }),
+                            Are({
+                                Event(IN_DELETE, dir_wd, Basename(file.path())),
+                                Event(IN_ATTRIB, file_wd),
+                                Event(IN_ACCESS, dir_wd, Basename(file.path())),
+                                Event(IN_ACCESS, file_wd),
+                                Event(IN_MODIFY, dir_wd, Basename(file.path())),
+                                Event(IN_MODIFY, file_wd),
+                            })));
 
   fd.reset();
   events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
@@ -1984,7 +2001,7 @@ TEST(Inotify, ExcludeUnlink_NoRandomSave) {
   ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
   std::vector<Event> events =
       ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
-  EXPECT_THAT(events, Are({
+  EXPECT_THAT(events, AreUnordered({
                           Event(IN_ATTRIB, file_wd),
                           Event(IN_DELETE, dir_wd, Basename(file.path())),
                       }));
@@ -2045,21 +2062,21 @@ TEST(Inotify, ExcludeUnlinkDirectory_NoRandomSave) {
 // We need to disable S/R because there are filesystems where we cannot re-open
 // fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
 TEST(Inotify, ExcludeUnlinkMultipleChildren_NoRandomSave) {
-  const DisableSave ds;
+  // Inotify does not work properly with hard links in gofer and overlay fs.
+  SKIP_IF(IsRunningOnGvisor() &&
+          !ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs(GetAbsoluteTestTmpdir())));
   // TODO(gvisor.dev/issue/1624): This test fails on VFS1.
   SKIP_IF(IsRunningWithVFS1());
 
+  const DisableSave ds;
+
   const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const TempPath file =
       ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
   std::string path1 = file.path();
   std::string path2 = NewTempAbsPathInDir(dir.path());
+  ASSERT_THAT(link(path1.c_str(), path2.c_str()), SyscallSucceeds());
 
-  const int rc = link(path1.c_str(), path2.c_str());
-  // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
-  SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
-          (errno == EPERM || errno == ENOENT));
-  ASSERT_THAT(rc, SyscallSucceeds());
   const FileDescriptor fd1 =
       ASSERT_NO_ERRNO_AND_VALUE(Open(path1.c_str(), O_RDWR));
   const FileDescriptor fd2 =
@@ -2091,6 +2108,15 @@ TEST(Inotify, ExcludeUnlinkMultipleChildren_NoRandomSave) {
 // We need to disable S/R because there are filesystems where we cannot re-open
 // fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
 TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1624): Fails on VFS1.
+  SKIP_IF(IsRunningWithVFS1());
+
+  // NOTE(gvisor.dev/issue/3654): In the gofer filesystem, we do not allow
+  // setting attributes through an fd if the file at the open path has been
+  // deleted.
+  SKIP_IF(IsRunningOnGvisor() &&
+          !ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs(GetAbsoluteTestTmpdir())));
+
   const DisableSave ds;
 
   const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
@@ -2100,18 +2126,6 @@ TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) {
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(file.path().c_str(), O_RDWR));
 
-  // NOTE(b/157163751): Create another link before unlinking. This is needed for
-  // the gofer filesystem in gVisor, where open fds will not work once the link
-  // count hits zero. In VFS2, we end up skipping the gofer test anyway, because
-  // hard links are not supported for gofer fs.
-  if (IsRunningOnGvisor()) {
-    std::string link_path = NewTempAbsPath();
-    const int rc = link(file.path().c_str(), link_path.c_str());
-    // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
-    SKIP_IF(rc != 0 && (errno == EPERM || errno == ENOENT));
-    ASSERT_THAT(rc, SyscallSucceeds());
-  }
-
   const FileDescriptor inotify_fd =
       ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
   const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
@@ -2127,12 +2141,18 @@ TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) {
   ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds());
   std::vector<Event> events =
       ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
-  EXPECT_THAT(events, Are({
-                          Event(IN_ATTRIB, file_wd),
-                          Event(IN_DELETE, dir_wd, Basename(file.path())),
-                          Event(IN_MODIFY, dir_wd, Basename(file.path())),
-                          Event(IN_MODIFY, file_wd),
-                      }));
+  EXPECT_THAT(events, AnyOf(Are({
+                                Event(IN_ATTRIB, file_wd),
+                                Event(IN_DELETE, dir_wd, Basename(file.path())),
+                                Event(IN_MODIFY, dir_wd, Basename(file.path())),
+                                Event(IN_MODIFY, file_wd),
+                            }),
+                            Are({
+                                Event(IN_DELETE, dir_wd, Basename(file.path())),
+                                Event(IN_ATTRIB, file_wd),
+                                Event(IN_MODIFY, dir_wd, Basename(file.path())),
+                                Event(IN_MODIFY, file_wd),
+                            })));
 
   const struct timeval times[2] = {{1, 0}, {2, 0}};
   ASSERT_THAT(futimes(fd.get(), times), SyscallSucceeds());
diff --git a/test/syscalls/linux/ip6tables.cc b/test/syscalls/linux/ip6tables.cc
new file mode 100644
index 000000000..e0e146067
--- /dev/null
+++ b/test/syscalls/linux/ip6tables.cc
@@ -0,0 +1,233 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/capability.h>
+#include <sys/socket.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/iptables.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr char kNatTablename[] = "nat";
+constexpr char kErrorTarget[] = "ERROR";
+constexpr size_t kEmptyStandardEntrySize =
+    sizeof(struct ip6t_entry) + sizeof(struct xt_standard_target);
+constexpr size_t kEmptyErrorEntrySize =
+    sizeof(struct ip6t_entry) + sizeof(struct xt_error_target);
+
+TEST(IP6TablesBasic, FailSockoptNonRaw) {
+  // Even if the user has CAP_NET_RAW, they shouldn't be able to use the
+  // ip6tables sockopts with a non-raw socket.
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET6, SOCK_DGRAM, 0), SyscallSucceeds());
+
+  struct ipt_getinfo info = {};
+  snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  socklen_t info_size = sizeof(info);
+  EXPECT_THAT(getsockopt(sock, SOL_IPV6, IP6T_SO_GET_INFO, &info, &info_size),
+              SyscallFailsWithErrno(ENOPROTOOPT));
+
+  EXPECT_THAT(close(sock), SyscallSucceeds());
+}
+
+TEST(IP6TablesBasic, GetInfoErrorPrecedence) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET6, SOCK_DGRAM, 0), SyscallSucceeds());
+
+  // When using the wrong type of socket and a too-short optlen, we should get
+  // EINVAL.
+  struct ipt_getinfo info = {};
+  snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  socklen_t info_size = sizeof(info) - 1;
+  EXPECT_THAT(getsockopt(sock, SOL_IPV6, IP6T_SO_GET_INFO, &info, &info_size),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(IP6TablesBasic, GetEntriesErrorPrecedence) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET6, SOCK_DGRAM, 0), SyscallSucceeds());
+
+  // When using the wrong type of socket and a too-short optlen, we should get
+  // EINVAL.
+  struct ip6t_get_entries entries = {};
+  socklen_t entries_size = sizeof(struct ip6t_get_entries) - 1;
+  snprintf(entries.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  EXPECT_THAT(
+      getsockopt(sock, SOL_IPV6, IP6T_SO_GET_ENTRIES, &entries, &entries_size),
+      SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(IP6TablesBasic, GetRevision) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW),
+              SyscallSucceeds());
+
+  struct xt_get_revision rev = {};
+  socklen_t rev_len = sizeof(rev);
+
+  snprintf(rev.name, sizeof(rev.name), "REDIRECT");
+  rev.revision = 0;
+
+  // Revision 0 exists.
+  EXPECT_THAT(
+      getsockopt(sock, SOL_IPV6, IP6T_SO_GET_REVISION_TARGET, &rev, &rev_len),
+      SyscallSucceeds());
+  EXPECT_EQ(rev.revision, 0);
+
+  // Revisions > 0 don't exist.
+  rev.revision = 1;
+  EXPECT_THAT(
+      getsockopt(sock, SOL_IPV6, IP6T_SO_GET_REVISION_TARGET, &rev, &rev_len),
+      SyscallFailsWithErrno(EPROTONOSUPPORT));
+}
+
+// This tests the initial state of a machine with empty ip6tables via
+// getsockopt(IP6T_SO_GET_INFO). We don't have a guarantee that the iptables are
+// empty when running in native, but we can test that gVisor has the same
+// initial state that a newly-booted Linux machine would have.
+TEST(IP6TablesTest, InitialInfo) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  FileDescriptor sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_RAW, IPPROTO_RAW));
+
+  // Get info via sockopt.
+  struct ipt_getinfo info = {};
+  snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  socklen_t info_size = sizeof(info);
+  ASSERT_THAT(
+      getsockopt(sock.get(), SOL_IPV6, IP6T_SO_GET_INFO, &info, &info_size),
+      SyscallSucceeds());
+
+  // The nat table supports PREROUTING, and OUTPUT.
+  unsigned int valid_hooks =
+      (1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_OUT) |
+      (1 << NF_IP6_POST_ROUTING) | (1 << NF_IP6_LOCAL_IN);
+  EXPECT_EQ(info.valid_hooks, valid_hooks);
+
+  // Each chain consists of an empty entry with a standard target..
+  EXPECT_EQ(info.hook_entry[NF_IP6_PRE_ROUTING], 0);
+  EXPECT_EQ(info.hook_entry[NF_IP6_LOCAL_IN], kEmptyStandardEntrySize);
+  EXPECT_EQ(info.hook_entry[NF_IP6_LOCAL_OUT], kEmptyStandardEntrySize * 2);
+  EXPECT_EQ(info.hook_entry[NF_IP6_POST_ROUTING], kEmptyStandardEntrySize * 3);
+
+  // The underflow points are the same as the entry points.
+  EXPECT_EQ(info.underflow[NF_IP6_PRE_ROUTING], 0);
+  EXPECT_EQ(info.underflow[NF_IP6_LOCAL_IN], kEmptyStandardEntrySize);
+  EXPECT_EQ(info.underflow[NF_IP6_LOCAL_OUT], kEmptyStandardEntrySize * 2);
+  EXPECT_EQ(info.underflow[NF_IP6_POST_ROUTING], kEmptyStandardEntrySize * 3);
+
+  // One entry for each chain, plus an error entry at the end.
+  EXPECT_EQ(info.num_entries, 5);
+
+  EXPECT_EQ(info.size, 4 * kEmptyStandardEntrySize + kEmptyErrorEntrySize);
+  EXPECT_EQ(strcmp(info.name, kNatTablename), 0);
+}
+
+// This tests the initial state of a machine with empty ip6tables via
+// getsockopt(IP6T_SO_GET_ENTRIES). We don't have a guarantee that the iptables
+// are empty when running in native, but we can test that gVisor has the same
+// initial state that a newly-booted Linux machine would have.
+TEST(IP6TablesTest, InitialEntries) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  FileDescriptor sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_RAW, IPPROTO_RAW));
+
+  // Get info via sockopt.
+  struct ipt_getinfo info = {};
+  snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  socklen_t info_size = sizeof(info);
+  ASSERT_THAT(
+      getsockopt(sock.get(), SOL_IPV6, IP6T_SO_GET_INFO, &info, &info_size),
+      SyscallSucceeds());
+
+  // Use info to get entries.
+  socklen_t entries_size = sizeof(struct ip6t_get_entries) + info.size;
+  struct ip6t_get_entries* entries =
+      static_cast<struct ip6t_get_entries*>(malloc(entries_size));
+  snprintf(entries->name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  entries->size = info.size;
+  ASSERT_THAT(getsockopt(sock.get(), SOL_IPV6, IP6T_SO_GET_ENTRIES, entries,
+                         &entries_size),
+              SyscallSucceeds());
+
+  // Verify the name and size.
+  ASSERT_EQ(info.size, entries->size);
+  ASSERT_EQ(strcmp(entries->name, kNatTablename), 0);
+
+  // Verify that the entrytable is 4 entries with accept targets and no matches
+  // followed by a single error target.
+  size_t entry_offset = 0;
+  while (entry_offset < entries->size) {
+    struct ip6t_entry* entry = reinterpret_cast<struct ip6t_entry*>(
+        reinterpret_cast<char*>(entries->entrytable) + entry_offset);
+
+    // ipv6 should be zeroed.
+    struct ip6t_ip6 zeroed = {};
+    ASSERT_EQ(memcmp(static_cast<void*>(&zeroed),
+                     static_cast<void*>(&entry->ipv6), sizeof(zeroed)),
+              0);
+
+    // target_offset should be zero.
+    EXPECT_EQ(entry->target_offset, sizeof(ip6t_entry));
+
+    if (entry_offset < kEmptyStandardEntrySize * 4) {
+      // The first 4 entries are standard targets
+      struct xt_standard_target* target =
+          reinterpret_cast<struct xt_standard_target*>(entry->elems);
+      EXPECT_EQ(entry->next_offset, kEmptyStandardEntrySize);
+      EXPECT_EQ(target->target.u.user.target_size, sizeof(*target));
+      EXPECT_EQ(strcmp(target->target.u.user.name, ""), 0);
+      EXPECT_EQ(target->target.u.user.revision, 0);
+      // This is what's returned for an accept verdict. I don't know why.
+      EXPECT_EQ(target->verdict, -NF_ACCEPT - 1);
+    } else {
+      // The last entry is an error target
+      struct xt_error_target* target =
+          reinterpret_cast<struct xt_error_target*>(entry->elems);
+      EXPECT_EQ(entry->next_offset, kEmptyErrorEntrySize);
+      EXPECT_EQ(target->target.u.user.target_size, sizeof(*target));
+      EXPECT_EQ(strcmp(target->target.u.user.name, kErrorTarget), 0);
+      EXPECT_EQ(target->target.u.user.revision, 0);
+      EXPECT_EQ(strcmp(target->errorname, kErrorTarget), 0);
+    }
+
+    entry_offset += entry->next_offset;
+    break;
+  }
+
+  free(entries);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/iptables.cc b/test/syscalls/linux/iptables.cc
index b8e4ece64..22550b800 100644
--- a/test/syscalls/linux/iptables.cc
+++ b/test/syscalls/linux/iptables.cc
@@ -67,12 +67,82 @@ TEST(IPTablesBasic, FailSockoptNonRaw) {
   struct ipt_getinfo info = {};
   snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
   socklen_t info_size = sizeof(info);
-  EXPECT_THAT(getsockopt(sock, IPPROTO_IP, SO_GET_INFO, &info, &info_size),
+  EXPECT_THAT(getsockopt(sock, SOL_IP, IPT_SO_GET_INFO, &info, &info_size),
               SyscallFailsWithErrno(ENOPROTOOPT));
 
   ASSERT_THAT(close(sock), SyscallSucceeds());
 }
 
+TEST(IPTablesBasic, GetInfoErrorPrecedence) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET, SOCK_DGRAM, 0), SyscallSucceeds());
+
+  // When using the wrong type of socket and a too-short optlen, we should get
+  // EINVAL.
+  struct ipt_getinfo info = {};
+  snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  socklen_t info_size = sizeof(info) - 1;
+  ASSERT_THAT(getsockopt(sock, SOL_IP, IPT_SO_GET_INFO, &info, &info_size),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(IPTablesBasic, GetEntriesErrorPrecedence) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET, SOCK_DGRAM, 0), SyscallSucceeds());
+
+  // When using the wrong type of socket and a too-short optlen, we should get
+  // EINVAL.
+  struct ipt_get_entries entries = {};
+  socklen_t entries_size = sizeof(struct ipt_get_entries) - 1;
+  snprintf(entries.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+  ASSERT_THAT(
+      getsockopt(sock, SOL_IP, IPT_SO_GET_ENTRIES, &entries, &entries_size),
+      SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(IPTablesBasic, OriginalDstErrors) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET, SOCK_STREAM, 0), SyscallSucceeds());
+
+  // Sockets not affected by NAT should fail to find an original destination.
+  struct sockaddr_in addr = {};
+  socklen_t addr_len = sizeof(addr);
+  EXPECT_THAT(getsockopt(sock, SOL_IP, SO_ORIGINAL_DST, &addr, &addr_len),
+              SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST(IPTablesBasic, GetRevision) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int sock;
+  ASSERT_THAT(sock = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP),
+              SyscallSucceeds());
+
+  struct xt_get_revision rev = {};
+  socklen_t rev_len = sizeof(rev);
+
+  snprintf(rev.name, sizeof(rev.name), "REDIRECT");
+  rev.revision = 0;
+
+  // Revision 0 exists.
+  EXPECT_THAT(
+      getsockopt(sock, SOL_IP, IPT_SO_GET_REVISION_TARGET, &rev, &rev_len),
+      SyscallSucceeds());
+  EXPECT_EQ(rev.revision, 0);
+
+  // Revisions > 0 don't exist.
+  rev.revision = 1;
+  EXPECT_THAT(
+      getsockopt(sock, SOL_IP, IPT_SO_GET_REVISION_TARGET, &rev, &rev_len),
+      SyscallFailsWithErrno(EPROTONOSUPPORT));
+}
+
 // Fixture for iptables tests.
 class IPTablesTest : public ::testing::Test {
  protected:
@@ -112,7 +182,7 @@ TEST_F(IPTablesTest, InitialState) {
   struct ipt_getinfo info = {};
   snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
   socklen_t info_size = sizeof(info);
-  ASSERT_THAT(getsockopt(s_, IPPROTO_IP, SO_GET_INFO, &info, &info_size),
+  ASSERT_THAT(getsockopt(s_, SOL_IP, IPT_SO_GET_INFO, &info, &info_size),
               SyscallSucceeds());
 
   // The nat table supports PREROUTING, and OUTPUT.
@@ -148,7 +218,7 @@ TEST_F(IPTablesTest, InitialState) {
   snprintf(entries->name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
   entries->size = info.size;
   ASSERT_THAT(
-      getsockopt(s_, IPPROTO_IP, SO_GET_ENTRIES, entries, &entries_size),
+      getsockopt(s_, SOL_IP, IPT_SO_GET_ENTRIES, entries, &entries_size),
       SyscallSucceeds());
 
   // Verify the name and size.
diff --git a/test/syscalls/linux/iptables.h b/test/syscalls/linux/iptables.h
index 0719c60a4..d0fc10fea 100644
--- a/test/syscalls/linux/iptables.h
+++ b/test/syscalls/linux/iptables.h
@@ -27,27 +27,32 @@
 
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
 #include <net/if.h>
 #include <netinet/ip.h>
 #include <stdint.h>
 
+//
+// IPv4 ABI.
+//
+
 #define ipt_standard_target xt_standard_target
 #define ipt_entry_target xt_entry_target
 #define ipt_error_target xt_error_target
 
 enum SockOpts {
   // For setsockopt.
-  BASE_CTL = 64,
-  SO_SET_REPLACE = BASE_CTL,
-  SO_SET_ADD_COUNTERS,
-  SO_SET_MAX = SO_SET_ADD_COUNTERS,
+  IPT_BASE_CTL = 64,
+  IPT_SO_SET_REPLACE = IPT_BASE_CTL,
+  IPT_SO_SET_ADD_COUNTERS = IPT_BASE_CTL + 1,
+  IPT_SO_SET_MAX = IPT_SO_SET_ADD_COUNTERS,
 
   // For getsockopt.
-  SO_GET_INFO = BASE_CTL,
-  SO_GET_ENTRIES,
-  SO_GET_REVISION_MATCH,
-  SO_GET_REVISION_TARGET,
-  SO_GET_MAX = SO_GET_REVISION_TARGET
+  IPT_SO_GET_INFO = IPT_BASE_CTL,
+  IPT_SO_GET_ENTRIES = IPT_BASE_CTL + 1,
+  IPT_SO_GET_REVISION_MATCH = IPT_BASE_CTL + 2,
+  IPT_SO_GET_REVISION_TARGET = IPT_BASE_CTL + 3,
+  IPT_SO_GET_MAX = IPT_SO_GET_REVISION_TARGET
 };
 
 // ipt_ip specifies basic matching criteria that can be applied by examining
@@ -115,7 +120,7 @@ struct ipt_entry {
   unsigned char elems[0];
 };
 
-// Passed to getsockopt(SO_GET_INFO).
+// Passed to getsockopt(IPT_SO_GET_INFO).
 struct ipt_getinfo {
   // The name of the table. The user only fills this in, the rest is filled in
   // when returning from getsockopt. Currently "nat" and "mangle" are supported.
@@ -127,7 +132,7 @@ struct ipt_getinfo {
   unsigned int valid_hooks;
 
   // The offset into the entry table for each valid hook. The entry table is
-  // returned by getsockopt(SO_GET_ENTRIES).
+  // returned by getsockopt(IPT_SO_GET_ENTRIES).
   unsigned int hook_entry[NF_IP_NUMHOOKS];
 
   // For each valid hook, the underflow is the offset into the entry table to
@@ -142,14 +147,14 @@ struct ipt_getinfo {
   unsigned int underflow[NF_IP_NUMHOOKS];
 
   // The number of entries in the entry table returned by
-  // getsockopt(SO_GET_ENTRIES).
+  // getsockopt(IPT_SO_GET_ENTRIES).
   unsigned int num_entries;
 
-  // The size of the entry table returned by getsockopt(SO_GET_ENTRIES).
+  // The size of the entry table returned by getsockopt(IPT_SO_GET_ENTRIES).
   unsigned int size;
 };
 
-// Passed to getsockopt(SO_GET_ENTRIES).
+// Passed to getsockopt(IPT_SO_GET_ENTRIES).
 struct ipt_get_entries {
   // The name of the table. The user fills this in. Currently "nat" and "mangle"
   // are supported.
@@ -195,4 +200,103 @@ struct ipt_replace {
   struct ipt_entry entries[0];
 };
 
+//
+// IPv6 ABI.
+//
+
+enum SockOpts6 {
+  // For setsockopt.
+  IP6T_BASE_CTL = 64,
+  IP6T_SO_SET_REPLACE = IP6T_BASE_CTL,
+  IP6T_SO_SET_ADD_COUNTERS = IP6T_BASE_CTL + 1,
+  IP6T_SO_SET_MAX = IP6T_SO_SET_ADD_COUNTERS,
+
+  // For getsockopt.
+  IP6T_SO_GET_INFO = IP6T_BASE_CTL,
+  IP6T_SO_GET_ENTRIES = IP6T_BASE_CTL + 1,
+  IP6T_SO_GET_REVISION_MATCH = IP6T_BASE_CTL + 4,
+  IP6T_SO_GET_REVISION_TARGET = IP6T_BASE_CTL + 5,
+  IP6T_SO_GET_MAX = IP6T_SO_GET_REVISION_TARGET
+};
+
+// ip6t_ip6 specifies basic matching criteria that can be applied by examining
+// only the IP header of a packet.
+struct ip6t_ip6 {
+  // Source IP address.
+  struct in6_addr src;
+
+  // Destination IP address.
+  struct in6_addr dst;
+
+  // Source IP address mask.
+  struct in6_addr smsk;
+
+  // Destination IP address mask.
+  struct in6_addr dmsk;
+
+  // Input interface.
+  char iniface[IFNAMSIZ];
+
+  // Output interface.
+  char outiface[IFNAMSIZ];
+
+  // Input interface mask.
+  unsigned char iniface_mask[IFNAMSIZ];
+
+  // Output interface mask.
+  unsigned char outiface_mask[IFNAMSIZ];
+
+  // Transport protocol.
+  uint16_t proto;
+
+  // TOS.
+  uint8_t tos;
+
+  // Flags.
+  uint8_t flags;
+
+  // Inverse flags.
+  uint8_t invflags;
+};
+
+// ip6t_entry is an ip6tables rule.
+struct ip6t_entry {
+  // Basic matching information used to match a packet's IP header.
+  struct ip6t_ip6 ipv6;
+
+  // A caching field that isn't used by userspace.
+  unsigned int nfcache;
+
+  // The number of bytes between the start of this entry and the rule's target.
+  uint16_t target_offset;
+
+  // The total size of this rule, from the beginning of the entry to the end of
+  // the target.
+  uint16_t next_offset;
+
+  // A return pointer not used by userspace.
+  unsigned int comefrom;
+
+  // Counters for packets and bytes, which we don't yet implement.
+  struct xt_counters counters;
+
+  // The data for all this rules matches followed by the target. This runs
+  // beyond the value of sizeof(struct ip6t_entry).
+  unsigned char elems[0];
+};
+
+// Passed to getsockopt(IP6T_SO_GET_ENTRIES).
+struct ip6t_get_entries {
+  // The name of the table.
+  char name[XT_TABLE_MAXNAMELEN];
+
+  // The size of the entry table in bytes. The user fills this in with the value
+  // from struct ipt_getinfo.size.
+  unsigned int size;
+
+  // The entries for the given table. This will run past the size defined by
+  // sizeof(struct ip6t_get_entries).
+  struct ip6t_entry entrytable[0];
+};
+
 #endif  // GVISOR_TEST_SYSCALLS_IPTABLES_TYPES_H_
diff --git a/test/syscalls/linux/kcov.cc b/test/syscalls/linux/kcov.cc
new file mode 100644
index 000000000..6816c1fd0
--- /dev/null
+++ b/test/syscalls/linux/kcov.cc
@@ -0,0 +1,184 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/errno.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <atomic>
+
+#include "gtest/gtest.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// For this set of tests to run, they must be run with coverage enabled. On
+// native Linux, this involves compiling the kernel with kcov enabled. For
+// gVisor, we need to enable the Go coverage tool, e.g. bazel test --
+// collect_coverage_data --instrumentation_filter=//pkg/... <test>.
+
+constexpr char kcovPath[] = "/sys/kernel/debug/kcov";
+constexpr int kSize = 4096;
+constexpr int KCOV_INIT_TRACE = 0x80086301;
+constexpr int KCOV_ENABLE = 0x6364;
+constexpr int KCOV_DISABLE = 0x6365;
+
+uint64_t* KcovMmap(int fd) {
+  return (uint64_t*)mmap(nullptr, kSize * sizeof(uint64_t),
+                         PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+}
+
+TEST(KcovTest, Kcov) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_DAC_OVERRIDE))));
+
+  int fd;
+  ASSERT_THAT(fd = open(kcovPath, O_RDWR),
+              AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(ENOENT)));
+  // Kcov not available.
+  SKIP_IF(errno == ENOENT);
+  auto fd_closer = Cleanup([fd]() { close(fd); });
+
+  ASSERT_THAT(ioctl(fd, KCOV_INIT_TRACE, kSize), SyscallSucceeds());
+  uint64_t* area = KcovMmap(fd);
+  ASSERT_TRUE(area != MAP_FAILED);
+  ASSERT_THAT(ioctl(fd, KCOV_ENABLE, 0), SyscallSucceeds());
+
+  for (int i = 0; i < 10; i++) {
+    // Make some syscalls to generate coverage data.
+    ASSERT_THAT(ioctl(fd, KCOV_ENABLE, 0), SyscallFailsWithErrno(EINVAL));
+  }
+
+  uint64_t num_pcs = *(uint64_t*)(area);
+  EXPECT_GT(num_pcs, 0);
+  for (uint64_t i = 1; i <= num_pcs; i++) {
+    // Verify that PCs are in the standard kernel range.
+    EXPECT_GT(area[i], 0xffffffff7fffffffL);
+  }
+
+  ASSERT_THAT(ioctl(fd, KCOV_DISABLE, 0), SyscallSucceeds());
+}
+
+TEST(KcovTest, PrematureMmap) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_DAC_OVERRIDE))));
+
+  int fd;
+  ASSERT_THAT(fd = open(kcovPath, O_RDWR),
+              AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(ENOENT)));
+  // Kcov not available.
+  SKIP_IF(errno == ENOENT);
+  auto fd_closer = Cleanup([fd]() { close(fd); });
+
+  // Cannot mmap before KCOV_INIT_TRACE.
+  uint64_t* area = KcovMmap(fd);
+  ASSERT_TRUE(area == MAP_FAILED);
+}
+
+// Tests that multiple kcov fds can be used simultaneously.
+TEST(KcovTest, MultipleFds) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_DAC_OVERRIDE))));
+
+  int fd1;
+  ASSERT_THAT(fd1 = open(kcovPath, O_RDWR),
+              AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(ENOENT)));
+  // Kcov not available.
+  SKIP_IF(errno == ENOENT);
+
+  int fd2;
+  ASSERT_THAT(fd2 = open(kcovPath, O_RDWR), SyscallSucceeds());
+  auto fd_closer = Cleanup([fd1, fd2]() {
+    close(fd1);
+    close(fd2);
+  });
+
+  auto t1 = ScopedThread([&] {
+    ASSERT_THAT(ioctl(fd1, KCOV_INIT_TRACE, kSize), SyscallSucceeds());
+    uint64_t* area = KcovMmap(fd1);
+    ASSERT_TRUE(area != MAP_FAILED);
+    ASSERT_THAT(ioctl(fd1, KCOV_ENABLE, 0), SyscallSucceeds());
+  });
+
+  ASSERT_THAT(ioctl(fd2, KCOV_INIT_TRACE, kSize), SyscallSucceeds());
+  uint64_t* area = KcovMmap(fd2);
+  ASSERT_TRUE(area != MAP_FAILED);
+  ASSERT_THAT(ioctl(fd2, KCOV_ENABLE, 0), SyscallSucceeds());
+}
+
+// Tests behavior for two threads trying to use the same kcov fd.
+TEST(KcovTest, MultipleThreads) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_DAC_OVERRIDE))));
+
+  int fd;
+  ASSERT_THAT(fd = open(kcovPath, O_RDWR),
+              AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(ENOENT)));
+  // Kcov not available.
+  SKIP_IF(errno == ENOENT);
+  auto fd_closer = Cleanup([fd]() { close(fd); });
+
+  // Test the behavior of multiple threads trying to use the same kcov fd
+  // simultaneously.
+  std::atomic<bool> t1_enabled(false), t1_disabled(false), t2_failed(false),
+      t2_exited(false);
+  auto t1 = ScopedThread([&] {
+    ASSERT_THAT(ioctl(fd, KCOV_INIT_TRACE, kSize), SyscallSucceeds());
+    uint64_t* area = KcovMmap(fd);
+    ASSERT_TRUE(area != MAP_FAILED);
+    ASSERT_THAT(ioctl(fd, KCOV_ENABLE, 0), SyscallSucceeds());
+    t1_enabled = true;
+
+    // After t2 has made sure that enabling kcov again fails, disable it.
+    while (!t2_failed) {
+      sched_yield();
+    }
+    ASSERT_THAT(ioctl(fd, KCOV_DISABLE, 0), SyscallSucceeds());
+    t1_disabled = true;
+
+    // Wait for t2 to enable kcov and then exit, after which we should be able
+    // to enable kcov again, without needing to set up a new memory mapping.
+    while (!t2_exited) {
+      sched_yield();
+    }
+    ASSERT_THAT(ioctl(fd, KCOV_ENABLE, 0), SyscallSucceeds());
+  });
+
+  auto t2 = ScopedThread([&] {
+    // Wait for t1 to enable kcov, and make sure that enabling kcov again fails.
+    while (!t1_enabled) {
+      sched_yield();
+    }
+    ASSERT_THAT(ioctl(fd, KCOV_ENABLE, 0), SyscallFailsWithErrno(EINVAL));
+    t2_failed = true;
+
+    // Wait for t1 to disable kcov, after which using fd should now succeed.
+    while (!t1_disabled) {
+      sched_yield();
+    }
+    uint64_t* area = KcovMmap(fd);
+    ASSERT_TRUE(area != MAP_FAILED);
+    ASSERT_THAT(ioctl(fd, KCOV_ENABLE, 0), SyscallSucceeds());
+  });
+
+  t2.Join();
+  t2_exited = true;
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/membarrier.cc b/test/syscalls/linux/membarrier.cc
new file mode 100644
index 000000000..516956a25
--- /dev/null
+++ b/test/syscalls/linux/membarrier.cc
@@ -0,0 +1,268 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <atomic>
+
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/cleanup.h"
+#include "test/util/logging.h"
+#include "test/util/memory_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// This is the classic test case for memory fences on architectures with total
+// store ordering; see e.g. Intel SDM Vol. 3A Sec. 8.2.3.4 "Loads May Be
+// Reordered with Earlier Stores to Different Locations". In each iteration of
+// the test, given two variables X and Y initially set to 0
+// (MembarrierTestSharedState::local_var and remote_var in the code), two
+// threads execute as follows:
+//
+// T1                                   T2
+// --                                   --
+//
+// X = 1                                Y = 1
+// T1fence()                            T2fence()
+// read Y                               read X
+//
+// On architectures where memory writes may be locally buffered by each CPU
+// (essentially all architectures), if T1fence() and T2fence() are omitted or
+// ineffective, it is possible for both T1 and T2 to read 0 because the memory
+// write from the other CPU is not yet visible outside that CPU. T1fence() and
+// T2fence() are expected to perform the necessary synchronization to restore
+// sequential consistency: both threads agree on a order of memory accesses that
+// is consistent with program order in each thread, such that at least one
+// thread reads 1.
+//
+// In the NoMembarrier test, T1fence() and T2fence() are both ordinary memory
+// fences establishing ordering between memory accesses before and after the
+// fence (std::atomic_thread_fence). In all other test cases, T1fence() is not a
+// memory fence at all, but only prevents compiler reordering of memory accesses
+// (std::atomic_signal_fence); T2fence() is an invocation of the membarrier()
+// syscall, which establishes ordering of memory accesses before and after the
+// syscall on both threads.
+
+template <typename F>
+int DoMembarrierTestSide(std::atomic<int>* our_var,
+                         std::atomic<int> const& their_var,
+                         F const& test_fence) {
+  our_var->store(1, std::memory_order_relaxed);
+  test_fence();
+  return their_var.load(std::memory_order_relaxed);
+}
+
+struct MembarrierTestSharedState {
+  std::atomic<int64_t> remote_iter_cur;
+  std::atomic<int64_t> remote_iter_done;
+  std::atomic<int> local_var;
+  std::atomic<int> remote_var;
+  int remote_obs_of_local_var;
+
+  void Init() {
+    remote_iter_cur.store(-1, std::memory_order_relaxed);
+    remote_iter_done.store(-1, std::memory_order_relaxed);
+  }
+};
+
+// Special value for MembarrierTestSharedState::remote_iter_cur indicating that
+// the remote thread should terminate.
+constexpr int64_t kRemoteIterStop = -2;
+
+// Must be async-signal-safe.
+template <typename F>
+void RunMembarrierTestRemoteSide(MembarrierTestSharedState* state,
+                                 F const& test_fence) {
+  int64_t i = 0;
+  int64_t cur;
+  while (true) {
+    while ((cur = state->remote_iter_cur.load(std::memory_order_acquire)) < i) {
+      if (cur == kRemoteIterStop) {
+        return;
+      }
+      // spin
+    }
+    state->remote_obs_of_local_var =
+        DoMembarrierTestSide(&state->remote_var, state->local_var, test_fence);
+    state->remote_iter_done.store(i, std::memory_order_release);
+    i++;
+  }
+}
+
+template <typename F>
+void RunMembarrierTestLocalSide(MembarrierTestSharedState* state,
+                                F const& test_fence) {
+  // On test completion, instruct the remote thread to terminate.
+  Cleanup cleanup_remote([&] {
+    state->remote_iter_cur.store(kRemoteIterStop, std::memory_order_relaxed);
+  });
+
+  int64_t i = 0;
+  absl::Time end = absl::Now() + absl::Seconds(5);  // arbitrary test duration
+  while (absl::Now() < end) {
+    // Reset both vars to 0.
+    state->local_var.store(0, std::memory_order_relaxed);
+    state->remote_var.store(0, std::memory_order_relaxed);
+    // Instruct the remote thread to begin this iteration.
+    state->remote_iter_cur.store(i, std::memory_order_release);
+    // Perform our side of the test.
+    auto local_obs_of_remote_var =
+        DoMembarrierTestSide(&state->local_var, state->remote_var, test_fence);
+    // Wait for the remote thread to finish this iteration.
+    while (state->remote_iter_done.load(std::memory_order_acquire) < i) {
+      // spin
+    }
+    ASSERT_TRUE(local_obs_of_remote_var != 0 ||
+                state->remote_obs_of_local_var != 0);
+    i++;
+  }
+}
+
+TEST(MembarrierTest, NoMembarrier) {
+  MembarrierTestSharedState state;
+  state.Init();
+
+  ScopedThread remote_thread([&] {
+    RunMembarrierTestRemoteSide(
+        &state, [] { std::atomic_thread_fence(std::memory_order_seq_cst); });
+  });
+  RunMembarrierTestLocalSide(
+      &state, [] { std::atomic_thread_fence(std::memory_order_seq_cst); });
+}
+
+enum membarrier_cmd {
+  MEMBARRIER_CMD_QUERY = 0,
+  MEMBARRIER_CMD_GLOBAL = (1 << 0),
+  MEMBARRIER_CMD_GLOBAL_EXPEDITED = (1 << 1),
+  MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED = (1 << 2),
+  MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3),
+  MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
+};
+
+int membarrier(membarrier_cmd cmd, int flags) {
+  return syscall(SYS_membarrier, cmd, flags);
+}
+
+PosixErrorOr<int> SupportedMembarrierCommands() {
+  int cmds = membarrier(MEMBARRIER_CMD_QUERY, 0);
+  if (cmds < 0) {
+    if (errno == ENOSYS) {
+      // No commands are supported.
+      return 0;
+    }
+    return PosixError(errno, "membarrier(MEMBARRIER_CMD_QUERY) failed");
+  }
+  return cmds;
+}
+
+TEST(MembarrierTest, Global) {
+  SKIP_IF((ASSERT_NO_ERRNO_AND_VALUE(SupportedMembarrierCommands()) &
+           MEMBARRIER_CMD_GLOBAL) == 0);
+
+  Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+      MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED));
+  auto state = static_cast<MembarrierTestSharedState*>(m.ptr());
+  state->Init();
+
+  pid_t const child_pid = fork();
+  if (child_pid == 0) {
+    // In child process.
+    RunMembarrierTestRemoteSide(
+        state, [] { TEST_PCHECK(membarrier(MEMBARRIER_CMD_GLOBAL, 0) == 0); });
+    _exit(0);
+  }
+  // In parent process.
+  ASSERT_THAT(child_pid, SyscallSucceeds());
+  Cleanup cleanup_child([&] {
+    int status;
+    ASSERT_THAT(waitpid(child_pid, &status, 0),
+                SyscallSucceedsWithValue(child_pid));
+    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+        << " status " << status;
+  });
+  RunMembarrierTestLocalSide(
+      state, [] { std::atomic_signal_fence(std::memory_order_seq_cst); });
+}
+
+TEST(MembarrierTest, GlobalExpedited) {
+  constexpr int kRequiredCommands = MEMBARRIER_CMD_GLOBAL_EXPEDITED |
+                                    MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED;
+  SKIP_IF((ASSERT_NO_ERRNO_AND_VALUE(SupportedMembarrierCommands()) &
+           kRequiredCommands) != kRequiredCommands);
+
+  ASSERT_THAT(membarrier(MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, 0),
+              SyscallSucceeds());
+
+  Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+      MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED));
+  auto state = static_cast<MembarrierTestSharedState*>(m.ptr());
+  state->Init();
+
+  pid_t const child_pid = fork();
+  if (child_pid == 0) {
+    // In child process.
+    RunMembarrierTestRemoteSide(state, [] {
+      TEST_PCHECK(membarrier(MEMBARRIER_CMD_GLOBAL_EXPEDITED, 0) == 0);
+    });
+    _exit(0);
+  }
+  // In parent process.
+  ASSERT_THAT(child_pid, SyscallSucceeds());
+  Cleanup cleanup_child([&] {
+    int status;
+    ASSERT_THAT(waitpid(child_pid, &status, 0),
+                SyscallSucceedsWithValue(child_pid));
+    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+        << " status " << status;
+  });
+  RunMembarrierTestLocalSide(
+      state, [] { std::atomic_signal_fence(std::memory_order_seq_cst); });
+}
+
+TEST(MembarrierTest, PrivateExpedited) {
+  constexpr int kRequiredCommands = MEMBARRIER_CMD_PRIVATE_EXPEDITED |
+                                    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED;
+  SKIP_IF((ASSERT_NO_ERRNO_AND_VALUE(SupportedMembarrierCommands()) &
+           kRequiredCommands) != kRequiredCommands);
+
+  ASSERT_THAT(membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0),
+              SyscallSucceeds());
+
+  MembarrierTestSharedState state;
+  state.Init();
+
+  ScopedThread remote_thread([&] {
+    RunMembarrierTestRemoteSide(&state, [] {
+      TEST_PCHECK(membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0) == 0);
+    });
+  });
+  RunMembarrierTestLocalSide(
+      &state, [] { std::atomic_signal_fence(std::memory_order_seq_cst); });
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc
index f8b7f7938..4a450742b 100644
--- a/test/syscalls/linux/memfd.cc
+++ b/test/syscalls/linux/memfd.cc
@@ -14,12 +14,10 @@
 
 #include <errno.h>
 #include <fcntl.h>
-#include <linux/magic.h>
 #include <linux/memfd.h>
 #include <linux/unistd.h>
 #include <string.h>
 #include <sys/mman.h>
-#include <sys/statfs.h>
 #include <sys/syscall.h>
 
 #include <vector>
@@ -53,6 +51,7 @@ namespace {
 #define F_SEAL_GROW 0x0004
 #define F_SEAL_WRITE 0x0008
 
+using ::gvisor::testing::IsTmpfs;
 using ::testing::StartsWith;
 
 const std::string kMemfdName = "some-memfd";
@@ -444,20 +443,6 @@ TEST(MemfdTest, SealsAreInodeLevelProperties) {
   EXPECT_THAT(ftruncate(memfd3.get(), kPageSize), SyscallFailsWithErrno(EPERM));
 }
 
-PosixErrorOr<bool> IsTmpfs(const std::string& path) {
-  struct statfs stat;
-  if (statfs(path.c_str(), &stat)) {
-    if (errno == ENOENT) {
-      // Nothing at path, don't raise this as an error. Instead, just report no
-      // tmpfs at path.
-      return false;
-    }
-    return PosixError(errno,
-                      absl::StrFormat("statfs(\"%s\", %#p)", path, &stat));
-  }
-  return stat.f_type == TMPFS_MAGIC;
-}
-
 // Tmpfs files also support seals, but are created with F_SEAL_SEAL.
 TEST(MemfdTest, TmpfsFilesHaveSealSeal) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs("/tmp")));
diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc
index 4036a9275..27758203d 100644
--- a/test/syscalls/linux/mkdir.cc
+++ b/test/syscalls/linux/mkdir.cc
@@ -82,6 +82,13 @@ TEST_F(MkdirTest, FailsOnDirWithoutWritePerms) {
               SyscallFailsWithErrno(EACCES));
 }
 
+TEST_F(MkdirTest, MkdirAtEmptyPath) {
+  ASSERT_THAT(mkdir(dirname_.c_str(), 0777), SyscallSucceeds());
+  auto fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dirname_, O_RDONLY | O_DIRECTORY, 0666));
+  EXPECT_THAT(mkdirat(fd.get(), "", 0777), SyscallFailsWithErrno(ENOENT));
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/mknod.cc b/test/syscalls/linux/mknod.cc
index 05dfb375a..1635c6d0c 100644
--- a/test/syscalls/linux/mknod.cc
+++ b/test/syscalls/linux/mknod.cc
@@ -14,6 +14,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/un.h>
@@ -92,15 +93,46 @@ TEST(MknodTest, MknodOnExistingPathFails) {
 }
 
 TEST(MknodTest, UnimplementedTypesReturnError) {
+  // TODO(gvisor.dev/issue/1624): These file types are supported by some
+  // filesystems in VFS2, so this test should be deleted along with VFS1.
+  SKIP_IF(!IsRunningWithVFS1());
+
   const std::string path = NewTempAbsPath();
+  EXPECT_THAT(mknod(path.c_str(), S_IFSOCK, 0),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+  EXPECT_THAT(mknod(path.c_str(), S_IFCHR, 0), SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(mknod(path.c_str(), S_IFBLK, 0), SyscallFailsWithErrno(EPERM));
+}
+
+TEST(MknodTest, Socket) {
+  SKIP_IF(IsRunningOnGvisor() && IsRunningWithVFS1());
+
+  ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds());
+
+  auto filename = NewTempRelPath();
+
+  ASSERT_THAT(mknod(filename.c_str(), S_IFSOCK | S_IRUSR | S_IWUSR, 0),
+              SyscallSucceeds());
+
+  int sk;
+  ASSERT_THAT(sk = socket(AF_UNIX, SOCK_SEQPACKET, 0), SyscallSucceeds());
+  FileDescriptor fd(sk);
 
-  if (IsRunningWithVFS1()) {
-    ASSERT_THAT(mknod(path.c_str(), S_IFSOCK, 0),
-                SyscallFailsWithErrno(EOPNOTSUPP));
+  struct sockaddr_un addr = {.sun_family = AF_UNIX};
+  absl::SNPrintF(addr.sun_path, sizeof(addr.sun_path), "%s", filename.c_str());
+  ASSERT_THAT(connect(sk, (struct sockaddr *)&addr, sizeof(addr)),
+              SyscallFailsWithErrno(ECONNREFUSED));
+  ASSERT_THAT(unlink(filename.c_str()), SyscallSucceeds());
+}
+
+PosixErrorOr<FileDescriptor> OpenRetryEINTR(std::string const& path, int flags,
+                                            mode_t mode = 0) {
+  while (true) {
+    auto maybe_fd = Open(path, flags, mode);
+    if (maybe_fd.ok() || maybe_fd.error().errno_value() != EINTR) {
+      return maybe_fd;
+    }
   }
-  // These will fail on linux as well since we don't have CAP_MKNOD.
-  ASSERT_THAT(mknod(path.c_str(), S_IFCHR, 0), SyscallFailsWithErrno(EPERM));
-  ASSERT_THAT(mknod(path.c_str(), S_IFBLK, 0), SyscallFailsWithErrno(EPERM));
 }
 
 TEST(MknodTest, Fifo) {
@@ -117,14 +149,16 @@ TEST(MknodTest, Fifo) {
 
   // Read-end of the pipe.
   ScopedThread t([&fifo, &buf, &msg]() {
-    FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(fifo.c_str(), O_RDONLY));
+    FileDescriptor fd =
+        ASSERT_NO_ERRNO_AND_VALUE(OpenRetryEINTR(fifo.c_str(), O_RDONLY));
     EXPECT_THAT(ReadFd(fd.get(), buf.data(), buf.size()),
                 SyscallSucceedsWithValue(msg.length()));
     EXPECT_EQ(msg, std::string(buf.data()));
   });
 
   // Write-end of the pipe.
-  FileDescriptor wfd = ASSERT_NO_ERRNO_AND_VALUE(Open(fifo.c_str(), O_WRONLY));
+  FileDescriptor wfd =
+      ASSERT_NO_ERRNO_AND_VALUE(OpenRetryEINTR(fifo.c_str(), O_WRONLY));
   EXPECT_THAT(WriteFd(wfd.get(), msg.c_str(), msg.length()),
               SyscallSucceedsWithValue(msg.length()));
 }
@@ -142,15 +176,16 @@ TEST(MknodTest, FifoOtrunc) {
   std::vector<char> buf(512);
   // Read-end of the pipe.
   ScopedThread t([&fifo, &buf, &msg]() {
-    FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(fifo.c_str(), O_RDONLY));
+    FileDescriptor fd =
+        ASSERT_NO_ERRNO_AND_VALUE(OpenRetryEINTR(fifo.c_str(), O_RDONLY));
     EXPECT_THAT(ReadFd(fd.get(), buf.data(), buf.size()),
                 SyscallSucceedsWithValue(msg.length()));
     EXPECT_EQ(msg, std::string(buf.data()));
   });
 
   // Write-end of the pipe.
-  FileDescriptor wfd =
-      ASSERT_NO_ERRNO_AND_VALUE(Open(fifo.c_str(), O_WRONLY | O_TRUNC));
+  FileDescriptor wfd = ASSERT_NO_ERRNO_AND_VALUE(
+      OpenRetryEINTR(fifo.c_str(), O_WRONLY | O_TRUNC));
   EXPECT_THAT(WriteFd(wfd.get(), msg.c_str(), msg.length()),
               SyscallSucceedsWithValue(msg.length()));
 }
@@ -170,20 +205,29 @@ TEST(MknodTest, FifoTruncNoOp) {
   std::vector<char> buf(512);
   // Read-end of the pipe.
   ScopedThread t([&fifo, &buf, &msg]() {
-    FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(fifo.c_str(), O_RDONLY));
+    FileDescriptor fd =
+        ASSERT_NO_ERRNO_AND_VALUE(OpenRetryEINTR(fifo.c_str(), O_RDONLY));
     EXPECT_THAT(ReadFd(fd.get(), buf.data(), buf.size()),
                 SyscallSucceedsWithValue(msg.length()));
     EXPECT_EQ(msg, std::string(buf.data()));
   });
 
-  FileDescriptor wfd =
-      ASSERT_NO_ERRNO_AND_VALUE(Open(fifo.c_str(), O_WRONLY | O_TRUNC));
+  FileDescriptor wfd = ASSERT_NO_ERRNO_AND_VALUE(
+      OpenRetryEINTR(fifo.c_str(), O_WRONLY | O_TRUNC));
   EXPECT_THAT(ftruncate(wfd.get(), 0), SyscallFailsWithErrno(EINVAL));
   EXPECT_THAT(WriteFd(wfd.get(), msg.c_str(), msg.length()),
               SyscallSucceedsWithValue(msg.length()));
   EXPECT_THAT(ftruncate(wfd.get(), 0), SyscallFailsWithErrno(EINVAL));
 }
 
+TEST(MknodTest, MknodAtEmptyPath) {
+  auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  auto fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY, 0666));
+  EXPECT_THAT(mknodat(fd.get(), "", S_IFREG | 0777, 0),
+              SyscallFailsWithErrno(ENOENT));
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc
index 46b6f38db..d65b7d031 100644
--- a/test/syscalls/linux/mount.cc
+++ b/test/syscalls/linux/mount.cc
@@ -34,6 +34,7 @@
 #include "test/util/mount_util.h"
 #include "test/util/multiprocess_util.h"
 #include "test/util/posix_error.h"
+#include "test/util/save_util.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
@@ -131,7 +132,9 @@ TEST(MountTest, UmountDetach) {
       ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir.path(), "tmpfs", 0, "mode=0700",
                                       /* umountflags= */ MNT_DETACH));
   const struct stat after = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path()));
-  EXPECT_NE(before.st_ino, after.st_ino);
+  EXPECT_FALSE(before.st_dev == after.st_dev && before.st_ino == after.st_ino)
+      << "mount point has device number " << before.st_dev
+      << " and inode number " << before.st_ino << " before and after mount";
 
   // Create files in the new mount.
   constexpr char kContents[] = "no no no";
@@ -147,8 +150,17 @@ TEST(MountTest, UmountDetach) {
   // Unmount the tmpfs.
   mount.Release()();
 
-  const struct stat after2 = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path()));
-  EXPECT_EQ(before.st_ino, after2.st_ino);
+  // Inode numbers for gofer-accessed files may change across save/restore.
+  //
+  // For overlayfs, if xino option is not enabled and if all overlayfs layers do
+  // not belong to the same filesystem then "the value of st_ino for directory
+  // objects may not be persistent and could change even while the overlay
+  // filesystem is mounted."  -- Documentation/filesystems/overlayfs.txt
+  if (!IsRunningWithSaveRestore() &&
+      !ASSERT_NO_ERRNO_AND_VALUE(IsOverlayfs(dir.path()))) {
+    const struct stat after2 = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path()));
+    EXPECT_EQ(before.st_ino, after2.st_ino);
+  }
 
   // Can still read file after unmounting.
   std::vector<char> buf(sizeof(kContents));
@@ -207,14 +219,26 @@ TEST(MountTest, MountTmpfs) {
 
     const struct stat s = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path()));
     EXPECT_EQ(s.st_mode, S_IFDIR | 0700);
-    EXPECT_NE(s.st_ino, before.st_ino);
+    EXPECT_FALSE(before.st_dev == s.st_dev && before.st_ino == s.st_ino)
+        << "mount point has device number " << before.st_dev
+        << " and inode number " << before.st_ino << " before and after mount";
 
     EXPECT_NO_ERRNO(Open(JoinPath(dir.path(), "foo"), O_CREAT | O_RDWR, 0777));
   }
 
   // Now that dir is unmounted again, we should have the old inode back.
-  const struct stat after = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path()));
-  EXPECT_EQ(before.st_ino, after.st_ino);
+  //
+  // Inode numbers for gofer-accessed files may change across save/restore.
+  //
+  // For overlayfs, if xino option is not enabled and if all overlayfs layers do
+  // not belong to the same filesystem then "the value of st_ino for directory
+  // objects may not be persistent and could change even while the overlay
+  // filesystem is mounted."  -- Documentation/filesystems/overlayfs.txt
+  if (!IsRunningWithSaveRestore() &&
+      !ASSERT_NO_ERRNO_AND_VALUE(IsOverlayfs(dir.path()))) {
+    const struct stat after = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path()));
+    EXPECT_EQ(before.st_ino, after.st_ino);
+  }
 }
 
 TEST(MountTest, MountTmpfsMagicValIgnored) {
diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc
index 51eacf3f2..78c36f98f 100644
--- a/test/syscalls/linux/open_create.cc
+++ b/test/syscalls/linux/open_create.cc
@@ -88,21 +88,21 @@ TEST(CreateTest, CreateExclusively) {
               SyscallFailsWithErrno(EEXIST));
 }
 
-TEST(CreateTeast, CreatWithOTrunc) {
+TEST(CreateTest, CreatWithOTrunc) {
   std::string dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
   ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
   ASSERT_THAT(open(dirpath.c_str(), O_CREAT | O_TRUNC, 0666),
               SyscallFailsWithErrno(EISDIR));
 }
 
-TEST(CreateTeast, CreatDirWithOTruncAndReadOnly) {
+TEST(CreateTest, CreatDirWithOTruncAndReadOnly) {
   std::string dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
   ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
   ASSERT_THAT(open(dirpath.c_str(), O_CREAT | O_TRUNC | O_RDONLY, 0666),
               SyscallFailsWithErrno(EISDIR));
 }
 
-TEST(CreateTeast, CreatFileWithOTruncAndReadOnly) {
+TEST(CreateTest, CreatFileWithOTruncAndReadOnly) {
   std::string dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncfile");
   int dirfd;
   ASSERT_THAT(dirfd = open(dirpath.c_str(), O_RDWR | O_CREAT, 0666),
@@ -149,6 +149,116 @@ TEST(CreateTest, OpenCreateROThenRW) {
   EXPECT_THAT(WriteFd(fd2.get(), &c, 1), SyscallSucceedsWithValue(1));
 }
 
+TEST(CreateTest, ChmodReadToWriteBetweenOpens_NoRandomSave) {
+  // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to
+  // override file read/write permissions. CAP_DAC_READ_SEARCH needs to be
+  // cleared for the same reason.
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0400));
+
+  const FileDescriptor rfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+  // Cannot restore after making permissions more restrictive.
+  const DisableSave ds;
+  ASSERT_THAT(fchmod(rfd.get(), 0200), SyscallSucceeds());
+
+  EXPECT_THAT(open(file.path().c_str(), O_RDONLY),
+              SyscallFailsWithErrno(EACCES));
+
+  const FileDescriptor wfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY));
+
+  char c = 'x';
+  EXPECT_THAT(write(wfd.get(), &c, 1), SyscallSucceedsWithValue(1));
+  c = 0;
+  EXPECT_THAT(read(rfd.get(), &c, 1), SyscallSucceedsWithValue(1));
+  EXPECT_EQ(c, 'x');
+}
+
+TEST(CreateTest, ChmodWriteToReadBetweenOpens_NoRandomSave) {
+  // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to
+  // override file read/write permissions.
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0200));
+
+  const FileDescriptor wfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY));
+
+  // Cannot restore after making permissions more restrictive.
+  const DisableSave ds;
+  ASSERT_THAT(fchmod(wfd.get(), 0400), SyscallSucceeds());
+
+  EXPECT_THAT(open(file.path().c_str(), O_WRONLY),
+              SyscallFailsWithErrno(EACCES));
+
+  const FileDescriptor rfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+  char c = 'x';
+  EXPECT_THAT(write(wfd.get(), &c, 1), SyscallSucceedsWithValue(1));
+  c = 0;
+  EXPECT_THAT(read(rfd.get(), &c, 1), SyscallSucceedsWithValue(1));
+  EXPECT_EQ(c, 'x');
+}
+
+TEST(CreateTest, CreateWithReadFlagNotAllowedByMode_NoRandomSave) {
+  // The only time we can open a file with flags forbidden by its permissions
+  // is when we are creating the file. We cannot re-open with the same flags,
+  // so we cannot restore an fd obtained from such an operation.
+  const DisableSave ds;
+
+  // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to
+  // override file read/write permissions. CAP_DAC_READ_SEARCH needs to be
+  // cleared for the same reason.
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+  // Create and open a file with read flag but without read permissions.
+  const std::string path = NewTempAbsPath();
+  const FileDescriptor rfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CREAT | O_RDONLY, 0222));
+
+  EXPECT_THAT(open(path.c_str(), O_RDONLY), SyscallFailsWithErrno(EACCES));
+  const FileDescriptor wfd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_WRONLY));
+
+  char c = 'x';
+  EXPECT_THAT(write(wfd.get(), &c, 1), SyscallSucceedsWithValue(1));
+  c = 0;
+  EXPECT_THAT(read(rfd.get(), &c, 1), SyscallSucceedsWithValue(1));
+  EXPECT_EQ(c, 'x');
+}
+
+TEST(CreateTest, CreateWithWriteFlagNotAllowedByMode_NoRandomSave) {
+  // The only time we can open a file with flags forbidden by its permissions
+  // is when we are creating the file. We cannot re-open with the same flags,
+  // so we cannot restore an fd obtained from such an operation.
+  const DisableSave ds;
+
+  // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to
+  // override file read/write permissions.
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+
+  // Create and open a file with write flag but without write permissions.
+  const std::string path = NewTempAbsPath();
+  const FileDescriptor wfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CREAT | O_WRONLY, 0444));
+
+  EXPECT_THAT(open(path.c_str(), O_WRONLY), SyscallFailsWithErrno(EACCES));
+  const FileDescriptor rfd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_RDONLY));
+
+  char c = 'x';
+  EXPECT_THAT(write(wfd.get(), &c, 1), SyscallSucceedsWithValue(1));
+  c = 0;
+  EXPECT_THAT(read(rfd.get(), &c, 1), SyscallSucceedsWithValue(1));
+  EXPECT_EQ(c, 'x');
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/packet_socket_raw.cc b/test/syscalls/linux/packet_socket_raw.cc
index a11a03415..a7c46adbf 100644
--- a/test/syscalls/linux/packet_socket_raw.cc
+++ b/test/syscalls/linux/packet_socket_raw.cc
@@ -14,9 +14,7 @@
 
 #include <arpa/inet.h>
 #include <linux/capability.h>
-#ifndef __fuchsia__
 #include <linux/filter.h>
-#endif  // __fuchsia__
 #include <linux/if_arp.h>
 #include <linux/if_packet.h>
 #include <net/ethernet.h>
@@ -618,8 +616,6 @@ TEST_P(RawPacketTest, GetSocketErrorBind) {
   }
 }
 
-#ifndef __fuchsia__
-
 TEST_P(RawPacketTest, SetSocketDetachFilterNoInstalledFilter) {
   // TODO(gvisor.dev/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
   //
@@ -647,8 +643,38 @@ TEST_P(RawPacketTest, GetSocketDetachFilter) {
               SyscallFailsWithErrno(ENOPROTOOPT));
 }
 
-#endif  // __fuchsia__
+TEST_P(RawPacketTest, SetAndGetSocketLinger) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int level = SOL_SOCKET;
+  int type = SO_LINGER;
 
+  struct linger sl;
+  sl.l_onoff = 1;
+  sl.l_linger = 5;
+  ASSERT_THAT(setsockopt(s_, level, type, &sl, sizeof(sl)),
+              SyscallSucceedsWithValue(0));
+
+  struct linger got_linger = {};
+  socklen_t length = sizeof(sl);
+  ASSERT_THAT(getsockopt(s_, level, type, &got_linger, &length),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_EQ(length, sizeof(got_linger));
+  EXPECT_EQ(0, memcmp(&sl, &got_linger, length));
+}
+
+TEST_P(RawPacketTest, GetSocketAcceptConn) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int got = -1;
+  socklen_t length = sizeof(got);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_ACCEPTCONN, &got, &length),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_EQ(length, sizeof(got));
+  EXPECT_EQ(got, 0);
+}
 INSTANTIATE_TEST_SUITE_P(AllInetTests, RawPacketTest,
                          ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index 34291850d..06d9dbf65 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include <fcntl.h> /* Obtain O_* constant definitions */
+#include <linux/magic.h>
 #include <sys/ioctl.h>
+#include <sys/statfs.h>
 #include <sys/uio.h>
 #include <unistd.h>
 
@@ -198,6 +200,16 @@ TEST_P(PipeTest, NonBlocking) {
               SyscallFailsWithErrno(EWOULDBLOCK));
 }
 
+TEST(PipeTest, StatFS) {
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  struct statfs st;
+  EXPECT_THAT(fstatfs(fds[0], &st), SyscallSucceeds());
+  EXPECT_EQ(st.f_type, PIPEFS_MAGIC);
+  EXPECT_EQ(st.f_bsize, getpagesize());
+  EXPECT_EQ(st.f_namelen, NAME_MAX);
+}
+
 TEST(Pipe2Test, CloExec) {
   int fds[2];
   ASSERT_THAT(pipe2(fds, O_CLOEXEC), SyscallSucceeds());
@@ -557,30 +569,38 @@ TEST_P(PipeTest, Streaming) {
 
   // Size() requires 2 syscalls, call it once and remember the value.
   const int pipe_size = Size();
+  const size_t streamed_bytes = 4 * pipe_size;
 
   absl::Notification notify;
-  ScopedThread t([this, &notify, pipe_size]() {
+  ScopedThread t([&, this]() {
+    std::vector<char> buf(1024);
     // Don't start until it's full.
     notify.WaitForNotification();
-    for (int i = 0; i < pipe_size; i++) {
-      int rbuf;
-      ASSERT_THAT(read(rfd_.get(), &rbuf, sizeof(rbuf)),
-                  SyscallSucceedsWithValue(sizeof(rbuf)));
-      EXPECT_EQ(rbuf, i);
+    ssize_t total = 0;
+    while (total < streamed_bytes) {
+      ASSERT_THAT(read(rfd_.get(), buf.data(), buf.size()),
+                  SyscallSucceedsWithValue(buf.size()));
+      total += buf.size();
     }
   });
 
   // Write 4 bytes * pipe_size. It will fill up the pipe once, notify the reader
   // to start. Then we write pipe size worth 3 more times to ensure the reader
   // can follow along.
+  //
+  // The size of each write (which is determined by buf.size()) must be smaller
+  // than the size of the pipe (which, in the "smallbuffer" configuration, is 1
+  // page) for the check for notify.Notify() below to be correct.
+  std::vector<char> buf(1024);
+  RandomizeBuffer(buf.data(), buf.size());
   ssize_t total = 0;
-  for (int i = 0; i < pipe_size; i++) {
-    ssize_t written = write(wfd_.get(), &i, sizeof(i));
-    ASSERT_THAT(written, SyscallSucceedsWithValue(sizeof(i)));
-    total += written;
+  while (total < streamed_bytes) {
+    ASSERT_THAT(write(wfd_.get(), buf.data(), buf.size()),
+                SyscallSucceedsWithValue(buf.size()));
+    total += buf.size();
 
     // Is the next write about to fill up the buffer? Wake up the reader once.
-    if (total < pipe_size && (total + written) >= pipe_size) {
+    if (total < pipe_size && (total + buf.size()) >= pipe_size) {
       notify.Notify();
     }
   }
diff --git a/test/syscalls/linux/prctl.cc b/test/syscalls/linux/prctl.cc
index 04c5161f5..f675dc430 100644
--- a/test/syscalls/linux/prctl.cc
+++ b/test/syscalls/linux/prctl.cc
@@ -153,7 +153,7 @@ TEST(PrctlTest, PDeathSig) {
       // Enable tracing, then raise SIGSTOP and expect our parent to suppress
       // it.
       TEST_CHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) >= 0);
-      raise(SIGSTOP);
+      TEST_CHECK(raise(SIGSTOP) == 0);
       // Sleep until killed by our parent death signal. sleep(3) is
       // async-signal-safe, absl::SleepFor isn't.
       while (true) {
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index d6b875dbf..e8fcc4439 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -16,6 +16,7 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <limits.h>
+#include <linux/magic.h>
 #include <sched.h>
 #include <signal.h>
 #include <stddef.h>
@@ -26,6 +27,7 @@
 #include <sys/mman.h>
 #include <sys/prctl.h>
 #include <sys/stat.h>
+#include <sys/statfs.h>
 #include <sys/utsname.h>
 #include <syscall.h>
 #include <unistd.h>
@@ -45,6 +47,7 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
+#include "absl/container/node_hash_set.h"
 #include "absl/strings/ascii.h"
 #include "absl/strings/match.h"
 #include "absl/strings/numbers.h"
@@ -61,6 +64,7 @@
 #include "test/util/fs_util.h"
 #include "test/util/memory_util.h"
 #include "test/util/posix_error.h"
+#include "test/util/proc_util.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
@@ -670,6 +674,23 @@ TEST(ProcSelfMaps, Mprotect) {
                                                    3 * kPageSize, PROT_READ)));
 }
 
+TEST(ProcSelfMaps, SharedAnon) {
+  const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+      MmapAnon(kPageSize, PROT_READ, MAP_SHARED | MAP_ANONYMOUS));
+
+  const auto proc_self_maps =
+      ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps"));
+  for (const auto& line : absl::StrSplit(proc_self_maps, '\n')) {
+    const auto entry = ASSERT_NO_ERRNO_AND_VALUE(ParseProcMapsLine(line));
+    if (entry.start <= m.addr() && m.addr() < entry.end) {
+      // cf. proc(5), "/proc/[pid]/map_files/"
+      EXPECT_EQ(entry.filename, "/dev/zero (deleted)");
+      return;
+    }
+  }
+  FAIL() << "no maps entry containing mapping at " << m.ptr();
+}
+
 TEST(ProcSelfFd, OpenFd) {
   int pipe_fds[2];
   ASSERT_THAT(pipe2(pipe_fds, O_CLOEXEC), SyscallSucceeds());
@@ -692,6 +713,30 @@ TEST(ProcSelfFd, OpenFd) {
   ASSERT_THAT(close(pipe_fds[1]), SyscallSucceeds());
 }
 
+static void CheckFdDirGetdentsDuplicates(const std::string& path) {
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(path.c_str(), O_RDONLY | O_DIRECTORY));
+  // Open a FD whose value is supposed to be much larger than
+  // the number of FDs opened by current process.
+  auto newfd = fcntl(fd.get(), F_DUPFD, 1024);
+  EXPECT_GE(newfd, 1024);
+  auto fd_closer = Cleanup([newfd]() { close(newfd); });
+  auto fd_files = ASSERT_NO_ERRNO_AND_VALUE(ListDir(path.c_str(), false));
+  absl::node_hash_set<std::string> fd_files_dedup(fd_files.begin(),
+                                                  fd_files.end());
+  EXPECT_EQ(fd_files.size(), fd_files_dedup.size());
+}
+
+// This is a regression test for gvisor.dev/issues/3894
+TEST(ProcSelfFd, GetdentsDuplicates) {
+  CheckFdDirGetdentsDuplicates("/proc/self/fd");
+}
+
+// This is a regression test for gvisor.dev/issues/3894
+TEST(ProcSelfFdInfo, GetdentsDuplicates) {
+  CheckFdDirGetdentsDuplicates("/proc/self/fdinfo");
+}
+
 TEST(ProcSelfFdInfo, CorrectFds) {
   // Make sure there is at least one open file.
   auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
@@ -735,8 +780,12 @@ TEST(ProcSelfFdInfo, Flags) {
 }
 
 TEST(ProcSelfExe, Absolute) {
-  auto exe = ASSERT_NO_ERRNO_AND_VALUE(
-      ReadLink(absl::StrCat("/proc/", getpid(), "/exe")));
+  auto exe = ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/self/exe"));
+  EXPECT_EQ(exe[0], '/');
+}
+
+TEST(ProcSelfCwd, Absolute) {
+  auto exe = ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/self/cwd"));
   EXPECT_EQ(exe[0], '/');
 }
 
@@ -771,17 +820,12 @@ TEST(ProcCpuinfo, DeniesWriteNonRoot) {
     constexpr int kNobody = 65534;
     EXPECT_THAT(syscall(SYS_setuid, kNobody), SyscallSucceeds());
     EXPECT_THAT(open("/proc/cpuinfo", O_WRONLY), SyscallFailsWithErrno(EACCES));
-    // TODO(gvisor.dev/issue/1193): Properly support setting size attributes in
-    // kernfs.
-    if (!IsRunningOnGvisor() || IsRunningWithVFS1()) {
-      EXPECT_THAT(truncate("/proc/cpuinfo", 123),
-                  SyscallFailsWithErrno(EACCES));
-    }
+    EXPECT_THAT(truncate("/proc/cpuinfo", 123), SyscallFailsWithErrno(EACCES));
   });
 }
 
 // With root privileges, it is possible to open /proc/cpuinfo with write mode,
-// but all write operations will return EIO.
+// but all write operations should fail.
 TEST(ProcCpuinfo, DeniesWriteRoot) {
   // VFS1 does not behave differently for root/non-root.
   SKIP_IF(IsRunningWithVFS1());
@@ -790,16 +834,10 @@ TEST(ProcCpuinfo, DeniesWriteRoot) {
   int fd;
   EXPECT_THAT(fd = open("/proc/cpuinfo", O_WRONLY), SyscallSucceeds());
   if (fd > 0) {
-    EXPECT_THAT(write(fd, "x", 1), SyscallFailsWithErrno(EIO));
-    EXPECT_THAT(pwrite(fd, "x", 1, 123), SyscallFailsWithErrno(EIO));
-  }
-  // TODO(gvisor.dev/issue/1193): Properly support setting size attributes in
-  // kernfs.
-  if (!IsRunningOnGvisor() || IsRunningWithVFS1()) {
-    if (fd > 0) {
-      EXPECT_THAT(ftruncate(fd, 123), SyscallFailsWithErrno(EIO));
-    }
-    EXPECT_THAT(truncate("/proc/cpuinfo", 123), SyscallFailsWithErrno(EIO));
+    // Truncate is not tested--it may succeed on some kernels without doing
+    // anything.
+    EXPECT_THAT(write(fd, "x", 1), SyscallFails());
+    EXPECT_THAT(pwrite(fd, "x", 1, 123), SyscallFails());
   }
 }
 
@@ -1439,6 +1477,16 @@ TEST(ProcPidExe, Subprocess) {
   EXPECT_EQ(actual, expected_absolute_path);
 }
 
+// /proc/PID/cwd points to the correct directory.
+TEST(ProcPidCwd, Subprocess) {
+  auto want = ASSERT_NO_ERRNO_AND_VALUE(GetCWD());
+
+  char got[PATH_MAX + 1] = {};
+  ASSERT_THAT(ReadlinkWhileRunning("cwd", got, sizeof(got)),
+              SyscallSucceedsWithValue(Gt(0)));
+  EXPECT_EQ(got, want);
+}
+
 // Test whether /proc/PID/ files can be read for a running process.
 TEST(ProcPidFile, SubprocessRunning) {
   char buf[1];
@@ -2159,6 +2207,18 @@ TEST(Proc, PidTidIOAccounting) {
   noop.Join();
 }
 
+TEST(Proc, Statfs) {
+  struct statfs st;
+  EXPECT_THAT(statfs("/proc", &st), SyscallSucceeds());
+  if (IsRunningWithVFS1()) {
+    EXPECT_EQ(st.f_type, ANON_INODE_FS_MAGIC);
+  } else {
+    EXPECT_EQ(st.f_type, PROC_SUPER_MAGIC);
+  }
+  EXPECT_EQ(st.f_bsize, getpagesize());
+  EXPECT_EQ(st.f_namelen, NAME_MAX);
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index 4fab097f4..23677e296 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -39,6 +39,7 @@ namespace testing {
 namespace {
 
 constexpr const char kProcNet[] = "/proc/net";
+constexpr const char kIpForward[] = "/proc/sys/net/ipv4/ip_forward";
 
 TEST(ProcNetSymlinkTarget, FileMode) {
   struct stat s;
@@ -515,6 +516,46 @@ TEST(ProcSysNetIpv4Recovery, CanReadAndWrite) {
               SyscallSucceedsWithValue(sizeof(kMessage)));
   EXPECT_EQ(strcmp(buf, "100\n"), 0);
 }
+
+TEST(ProcSysNetIpv4IpForward, Exists) {
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kIpForward, O_RDONLY));
+}
+
+TEST(ProcSysNetIpv4IpForward, DefaultValueEqZero) {
+  // Test is only valid in sandbox. Not hermetic in native tests
+  // running on a arbitrary machine.
+  SKIP_IF(!IsRunningOnGvisor());
+  auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kIpForward, O_RDONLY));
+
+  char buf = 101;
+  EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(buf), 0),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_EQ(buf, '0') << "unexpected ip_forward: " << buf;
+}
+
+TEST(ProcSysNetIpv4IpForward, CanReadAndWrite) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_DAC_OVERRIDE))));
+
+  auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kIpForward, O_RDWR));
+
+  char buf;
+  EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(buf), 0),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_TRUE(buf == '0' || buf == '1') << "unexpected ip_forward: " << buf;
+
+  // constexpr char to_write = '1';
+  char to_write = (buf == '1') ? '0' : '1';
+  EXPECT_THAT(PwriteFd(fd.get(), &to_write, sizeof(to_write), 0),
+              SyscallSucceedsWithValue(sizeof(to_write)));
+
+  buf = 0;
+  EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(buf), 0),
+              SyscallSucceedsWithValue(sizeof(buf)));
+  EXPECT_EQ(buf, to_write);
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/proc_pid_smaps.cc b/test/syscalls/linux/proc_pid_smaps.cc
index 9fb1b3a2c..738923822 100644
--- a/test/syscalls/linux/proc_pid_smaps.cc
+++ b/test/syscalls/linux/proc_pid_smaps.cc
@@ -191,7 +191,7 @@ PosixErrorOr<std::vector<ProcPidSmapsEntry>> ParseProcPidSmaps(
     // amount of whitespace).
     if (!entry) {
       std::cerr << "smaps line not considered a maps line: "
-                << maybe_maps_entry.error_message() << std::endl;
+                << maybe_maps_entry.error().message() << std::endl;
       return PosixError(
           EINVAL,
           absl::StrCat("smaps field line without preceding maps line: ", l));
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
index f9392b9e0..0b174e2be 100644
--- a/test/syscalls/linux/pty.cc
+++ b/test/syscalls/linux/pty.cc
@@ -51,6 +51,7 @@ using ::testing::AnyOf;
 using ::testing::Contains;
 using ::testing::Eq;
 using ::testing::Not;
+using SubprocessCallback = std::function<void()>;
 
 // Tests Unix98 pseudoterminals.
 //
@@ -389,15 +390,15 @@ TEST(PtyTrunc, Truncate) {
   // (f)truncate should.
   FileDescriptor master =
       ASSERT_NO_ERRNO_AND_VALUE(Open(kMasterPath, O_RDWR | O_TRUNC));
-  int n = ASSERT_NO_ERRNO_AND_VALUE(SlaveID(master));
+  int n = ASSERT_NO_ERRNO_AND_VALUE(ReplicaID(master));
   std::string spath = absl::StrCat("/dev/pts/", n);
-  FileDescriptor slave =
+  FileDescriptor replica =
       ASSERT_NO_ERRNO_AND_VALUE(Open(spath, O_RDWR | O_NONBLOCK | O_TRUNC));
 
   EXPECT_THAT(truncate(kMasterPath, 0), SyscallFailsWithErrno(EINVAL));
   EXPECT_THAT(truncate(spath.c_str(), 0), SyscallFailsWithErrno(EINVAL));
   EXPECT_THAT(ftruncate(master.get(), 0), SyscallFailsWithErrno(EINVAL));
-  EXPECT_THAT(ftruncate(slave.get(), 0), SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(ftruncate(replica.get(), 0), SyscallFailsWithErrno(EINVAL));
 }
 
 TEST(BasicPtyTest, StatUnopenedMaster) {
@@ -453,16 +454,16 @@ void ExpectReadable(const FileDescriptor& fd, int expected, char* buf) {
   EXPECT_EQ(expected, n);
 }
 
-TEST(BasicPtyTest, OpenMasterSlave) {
+TEST(BasicPtyTest, OpenMasterReplica) {
   FileDescriptor master = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
-  FileDescriptor slave = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master));
+  FileDescriptor replica = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master));
 }
 
-// The slave entry in /dev/pts/ disappears when the master is closed, even if
-// the slave is still open.
-TEST(BasicPtyTest, SlaveEntryGoneAfterMasterClose) {
+// The replica entry in /dev/pts/ disappears when the master is closed, even if
+// the replica is still open.
+TEST(BasicPtyTest, ReplicaEntryGoneAfterMasterClose) {
   FileDescriptor master = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
-  FileDescriptor slave = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master));
+  FileDescriptor replica = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master));
 
   // Get pty index.
   int index = -1;
@@ -482,12 +483,12 @@ TEST(BasicPtyTest, Getdents) {
   FileDescriptor master1 = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
   int index1 = -1;
   ASSERT_THAT(ioctl(master1.get(), TIOCGPTN, &index1), SyscallSucceeds());
-  FileDescriptor slave1 = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master1));
+  FileDescriptor replica1 = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master1));
 
   FileDescriptor master2 = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
   int index2 = -1;
   ASSERT_THAT(ioctl(master2.get(), TIOCGPTN, &index2), SyscallSucceeds());
-  FileDescriptor slave2 = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master2));
+  FileDescriptor replica2 = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master2));
 
   // The directory contains ptmx, index1, and index2. (Plus any additional PTYs
   // unrelated to this test.)
@@ -519,59 +520,60 @@ class PtyTest : public ::testing::Test {
  protected:
   void SetUp() override {
     master_ = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
-    slave_ = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master_));
+    replica_ = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master_));
   }
 
   void DisableCanonical() {
     struct kernel_termios t = {};
-    EXPECT_THAT(ioctl(slave_.get(), TCGETS, &t), SyscallSucceeds());
+    EXPECT_THAT(ioctl(replica_.get(), TCGETS, &t), SyscallSucceeds());
     t.c_lflag &= ~ICANON;
-    EXPECT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+    EXPECT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
   }
 
   void EnableCanonical() {
     struct kernel_termios t = {};
-    EXPECT_THAT(ioctl(slave_.get(), TCGETS, &t), SyscallSucceeds());
+    EXPECT_THAT(ioctl(replica_.get(), TCGETS, &t), SyscallSucceeds());
     t.c_lflag |= ICANON;
-    EXPECT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+    EXPECT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
   }
 
-  // Master and slave ends of the PTY. Non-blocking.
+  // Master and replica ends of the PTY. Non-blocking.
   FileDescriptor master_;
-  FileDescriptor slave_;
+  FileDescriptor replica_;
 };
 
-// Master to slave sanity test.
-TEST_F(PtyTest, WriteMasterToSlave) {
-  // N.B. by default, the slave reads nothing until the master writes a newline.
+// Master to replica sanity test.
+TEST_F(PtyTest, WriteMasterToReplica) {
+  // N.B. by default, the replica reads nothing until the master writes a
+  // newline.
   constexpr char kBuf[] = "hello\n";
 
   EXPECT_THAT(WriteFd(master_.get(), kBuf, sizeof(kBuf) - 1),
               SyscallSucceedsWithValue(sizeof(kBuf) - 1));
 
-  // Linux moves data from the master to the slave via async work scheduled via
-  // tty_flip_buffer_push. Since it is asynchronous, the data may not be
+  // Linux moves data from the master to the replica via async work scheduled
+  // via tty_flip_buffer_push. Since it is asynchronous, the data may not be
   // available for reading immediately. Instead we must poll and assert that it
   // becomes available "soon".
 
   char buf[sizeof(kBuf)] = {};
-  ExpectReadable(slave_, sizeof(buf) - 1, buf);
+  ExpectReadable(replica_, sizeof(buf) - 1, buf);
 
   EXPECT_EQ(memcmp(buf, kBuf, sizeof(kBuf)), 0);
 }
 
-// Slave to master sanity test.
-TEST_F(PtyTest, WriteSlaveToMaster) {
-  // N.B. by default, the master reads nothing until the slave writes a newline,
-  // and the master gets a carriage return.
+// Replica to master sanity test.
+TEST_F(PtyTest, WriteReplicaToMaster) {
+  // N.B. by default, the master reads nothing until the replica writes a
+  // newline, and the master gets a carriage return.
   constexpr char kInput[] = "hello\n";
   constexpr char kExpected[] = "hello\r\n";
 
-  EXPECT_THAT(WriteFd(slave_.get(), kInput, sizeof(kInput) - 1),
+  EXPECT_THAT(WriteFd(replica_.get(), kInput, sizeof(kInput) - 1),
               SyscallSucceedsWithValue(sizeof(kInput) - 1));
 
-  // Linux moves data from the master to the slave via async work scheduled via
-  // tty_flip_buffer_push. Since it is asynchronous, the data may not be
+  // Linux moves data from the master to the replica via async work scheduled
+  // via tty_flip_buffer_push. Since it is asynchronous, the data may not be
   // available for reading immediately. Instead we must poll and assert that it
   // becomes available "soon".
 
@@ -587,32 +589,33 @@ TEST_F(PtyTest, WriteInvalidUTF8) {
               SyscallSucceedsWithValue(sizeof(c)));
 }
 
-// Both the master and slave report the standard default termios settings.
+// Both the master and replica report the standard default termios settings.
 //
-// Note that TCGETS on the master actually redirects to the slave (see comment
+// Note that TCGETS on the master actually redirects to the replica (see comment
 // on MasterTermiosUnchangable).
 TEST_F(PtyTest, DefaultTermios) {
   struct kernel_termios t = {};
-  EXPECT_THAT(ioctl(slave_.get(), TCGETS, &t), SyscallSucceeds());
+  EXPECT_THAT(ioctl(replica_.get(), TCGETS, &t), SyscallSucceeds());
   EXPECT_EQ(t, DefaultTermios());
 
   EXPECT_THAT(ioctl(master_.get(), TCGETS, &t), SyscallSucceeds());
   EXPECT_EQ(t, DefaultTermios());
 }
 
-// Changing termios from the master actually affects the slave.
+// Changing termios from the master actually affects the replica.
 //
-// TCSETS on the master actually redirects to the slave (see comment on
+// TCSETS on the master actually redirects to the replica (see comment on
 // MasterTermiosUnchangable).
-TEST_F(PtyTest, TermiosAffectsSlave) {
+TEST_F(PtyTest, TermiosAffectsReplica) {
   struct kernel_termios master_termios = {};
   EXPECT_THAT(ioctl(master_.get(), TCGETS, &master_termios), SyscallSucceeds());
   master_termios.c_lflag ^= ICANON;
   EXPECT_THAT(ioctl(master_.get(), TCSETS, &master_termios), SyscallSucceeds());
 
-  struct kernel_termios slave_termios = {};
-  EXPECT_THAT(ioctl(slave_.get(), TCGETS, &slave_termios), SyscallSucceeds());
-  EXPECT_EQ(master_termios, slave_termios);
+  struct kernel_termios replica_termios = {};
+  EXPECT_THAT(ioctl(replica_.get(), TCGETS, &replica_termios),
+              SyscallSucceeds());
+  EXPECT_EQ(master_termios, replica_termios);
 }
 
 // The master end of the pty has termios:
@@ -627,7 +630,7 @@ TEST_F(PtyTest, TermiosAffectsSlave) {
 //
 // (From drivers/tty/pty.c:unix98_pty_init)
 //
-// All termios control ioctls on the master actually redirect to the slave
+// All termios control ioctls on the master actually redirect to the replica
 // (drivers/tty/tty_ioctl.c:tty_mode_ioctl), making it impossible to change the
 // master termios.
 //
@@ -640,7 +643,7 @@ TEST_F(PtyTest, MasterTermiosUnchangable) {
   EXPECT_THAT(ioctl(master_.get(), TCSETS, &master_termios), SyscallSucceeds());
 
   char c = '\r';
-  ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1));
+  ASSERT_THAT(WriteFd(replica_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
   ExpectReadable(master_, 1, &c);
   EXPECT_EQ(c, '\r');  // ICRNL had no effect!
@@ -653,15 +656,15 @@ TEST_F(PtyTest, TermiosICRNL) {
   struct kernel_termios t = DefaultTermios();
   t.c_iflag |= ICRNL;
   t.c_lflag &= ~ICANON;  // for byte-by-byte reading.
-  ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+  ASSERT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
 
   char c = '\r';
   ASSERT_THAT(WriteFd(master_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
-  ExpectReadable(slave_, 1, &c);
+  ExpectReadable(replica_, 1, &c);
   EXPECT_EQ(c, '\n');
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 // ONLCR rewrites output \n to \r\n.
@@ -669,42 +672,42 @@ TEST_F(PtyTest, TermiosONLCR) {
   struct kernel_termios t = DefaultTermios();
   t.c_oflag |= ONLCR;
   t.c_lflag &= ~ICANON;  // for byte-by-byte reading.
-  ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+  ASSERT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
 
   char c = '\n';
-  ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1));
+  ASSERT_THAT(WriteFd(replica_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
   // Extra byte for NUL for EXPECT_STREQ.
   char buf[3] = {};
   ExpectReadable(master_, 2, buf);
   EXPECT_STREQ(buf, "\r\n");
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 TEST_F(PtyTest, TermiosIGNCR) {
   struct kernel_termios t = DefaultTermios();
   t.c_iflag |= IGNCR;
   t.c_lflag &= ~ICANON;  // for byte-by-byte reading.
-  ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+  ASSERT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
 
   char c = '\r';
   ASSERT_THAT(WriteFd(master_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
   // Nothing to read.
-  ASSERT_THAT(PollAndReadFd(slave_.get(), &c, 1, kTimeout),
+  ASSERT_THAT(PollAndReadFd(replica_.get(), &c, 1, kTimeout),
               PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
 }
 
-// Test that we can successfully poll for readable data from the slave.
-TEST_F(PtyTest, TermiosPollSlave) {
+// Test that we can successfully poll for readable data from the replica.
+TEST_F(PtyTest, TermiosPollReplica) {
   struct kernel_termios t = DefaultTermios();
   t.c_iflag |= IGNCR;
   t.c_lflag &= ~ICANON;  // for byte-by-byte reading.
-  ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+  ASSERT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
 
   absl::Notification notify;
-  int sfd = slave_.get();
+  int sfd = replica_.get();
   ScopedThread th([sfd, &notify]() {
     notify.Notify();
 
@@ -753,33 +756,33 @@ TEST_F(PtyTest, TermiosPollMaster) {
   absl::SleepFor(absl::Seconds(1));
 
   char s[] = "foo\n";
-  ASSERT_THAT(WriteFd(slave_.get(), s, strlen(s) + 1), SyscallSucceeds());
+  ASSERT_THAT(WriteFd(replica_.get(), s, strlen(s) + 1), SyscallSucceeds());
 }
 
 TEST_F(PtyTest, TermiosINLCR) {
   struct kernel_termios t = DefaultTermios();
   t.c_iflag |= INLCR;
   t.c_lflag &= ~ICANON;  // for byte-by-byte reading.
-  ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+  ASSERT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
 
   char c = '\n';
   ASSERT_THAT(WriteFd(master_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
-  ExpectReadable(slave_, 1, &c);
+  ExpectReadable(replica_, 1, &c);
   EXPECT_EQ(c, '\r');
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 TEST_F(PtyTest, TermiosONOCR) {
   struct kernel_termios t = DefaultTermios();
   t.c_oflag |= ONOCR;
   t.c_lflag &= ~ICANON;  // for byte-by-byte reading.
-  ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+  ASSERT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
 
   // The terminal is at column 0, so there should be no CR to read.
   char c = '\r';
-  ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1));
+  ASSERT_THAT(WriteFd(replica_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
   // Nothing to read.
   ASSERT_THAT(PollAndReadFd(master_.get(), &c, 1, kTimeout),
@@ -789,7 +792,7 @@ TEST_F(PtyTest, TermiosONOCR) {
   // out of the other end.
   constexpr char kInput[] = "foo\r";
   constexpr int kInputSize = sizeof(kInput) - 1;
-  ASSERT_THAT(WriteFd(slave_.get(), kInput, kInputSize),
+  ASSERT_THAT(WriteFd(replica_.get(), kInput, kInputSize),
               SyscallSucceedsWithValue(kInputSize));
 
   char buf[kInputSize] = {};
@@ -800,7 +803,7 @@ TEST_F(PtyTest, TermiosONOCR) {
   ExpectFinished(master_);
 
   // Terminal should be at column 0 again, so no CR can be read.
-  ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1));
+  ASSERT_THAT(WriteFd(replica_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
   // Nothing to read.
   ASSERT_THAT(PollAndReadFd(master_.get(), &c, 1, kTimeout),
@@ -811,11 +814,11 @@ TEST_F(PtyTest, TermiosOCRNL) {
   struct kernel_termios t = DefaultTermios();
   t.c_oflag |= OCRNL;
   t.c_lflag &= ~ICANON;  // for byte-by-byte reading.
-  ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+  ASSERT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
 
   // The terminal is at column 0, so there should be no CR to read.
   char c = '\r';
-  ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1));
+  ASSERT_THAT(WriteFd(replica_.get(), &c, 1), SyscallSucceedsWithValue(1));
 
   ExpectReadable(master_, 1, &c);
   EXPECT_EQ(c, '\n');
@@ -831,24 +834,24 @@ TEST_F(PtyTest, VEOLTermination) {
   ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput)),
               SyscallSucceedsWithValue(sizeof(kInput)));
   char buf[sizeof(kInput)] = {};
-  ASSERT_THAT(PollAndReadFd(slave_.get(), buf, sizeof(kInput), kTimeout),
+  ASSERT_THAT(PollAndReadFd(replica_.get(), buf, sizeof(kInput), kTimeout),
               PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
 
   // Set the EOL character to '=' and write it.
   constexpr char delim = '=';
   struct kernel_termios t = DefaultTermios();
   t.c_cc[VEOL] = delim;
-  ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+  ASSERT_THAT(ioctl(replica_.get(), TCSETS, &t), SyscallSucceeds());
   ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
 
   // Now we can read, as sending EOL caused the line to become available.
-  ExpectReadable(slave_, sizeof(kInput), buf);
+  ExpectReadable(replica_, sizeof(kInput), buf);
   EXPECT_EQ(memcmp(buf, kInput, sizeof(kInput)), 0);
 
-  ExpectReadable(slave_, 1, buf);
+  ExpectReadable(replica_, 1, buf);
   EXPECT_EQ(buf[0], '=');
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 // Tests that we can write more than the 4096 character limit, then a
@@ -864,9 +867,9 @@ TEST_F(PtyTest, CanonBigWrite) {
 
   // We can read the line.
   char buf[kMaxLineSize] = {};
-  ExpectReadable(slave_, kMaxLineSize, buf);
+  ExpectReadable(replica_, kMaxLineSize, buf);
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 // Tests that data written in canonical mode can be read immediately once
@@ -880,15 +883,15 @@ TEST_F(PtyTest, SwitchCanonToNoncanon) {
 
   // Nothing available yet.
   char buf[sizeof(kInput)] = {};
-  ASSERT_THAT(PollAndReadFd(slave_.get(), buf, sizeof(kInput), kTimeout),
+  ASSERT_THAT(PollAndReadFd(replica_.get(), buf, sizeof(kInput), kTimeout),
               PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
 
   DisableCanonical();
 
-  ExpectReadable(slave_, sizeof(kInput), buf);
+  ExpectReadable(replica_, sizeof(kInput), buf);
   EXPECT_STREQ(buf, kInput);
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 TEST_F(PtyTest, SwitchCanonToNonCanonNewline) {
@@ -901,10 +904,10 @@ TEST_F(PtyTest, SwitchCanonToNonCanonNewline) {
 
   // We can read the line.
   char buf[sizeof(kInput)] = {};
-  ExpectReadable(slave_, sizeof(kInput), buf);
+  ExpectReadable(replica_, sizeof(kInput), buf);
   EXPECT_STREQ(buf, kInput);
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 TEST_F(PtyTest, SwitchNoncanonToCanonNewlineBig) {
@@ -917,7 +920,7 @@ TEST_F(PtyTest, SwitchNoncanonToCanonNewlineBig) {
   ASSERT_THAT(WriteFd(master_.get(), input, kWriteLen),
               SyscallSucceedsWithValue(kWriteLen));
   // Wait for the input queue to fill.
-  ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), kMaxLineSize - 1));
+  ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), kMaxLineSize - 1));
   constexpr char delim = '\n';
   ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
 
@@ -925,12 +928,12 @@ TEST_F(PtyTest, SwitchNoncanonToCanonNewlineBig) {
 
   // We can read the line.
   char buf[kMaxLineSize] = {};
-  ExpectReadable(slave_, kMaxLineSize - 1, buf);
+  ExpectReadable(replica_, kMaxLineSize - 1, buf);
 
   // We can also read the remaining characters.
-  ExpectReadable(slave_, 6, buf);
+  ExpectReadable(replica_, 6, buf);
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 TEST_F(PtyTest, SwitchNoncanonToCanonNoNewline) {
@@ -942,15 +945,15 @@ TEST_F(PtyTest, SwitchNoncanonToCanonNoNewline) {
   ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput) - 1),
               SyscallSucceedsWithValue(sizeof(kInput) - 1));
 
-  ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), sizeof(kInput) - 1));
+  ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), sizeof(kInput) - 1));
   EnableCanonical();
 
   // We can read the line.
   char buf[sizeof(kInput)] = {};
-  ExpectReadable(slave_, sizeof(kInput) - 1, buf);
+  ExpectReadable(replica_, sizeof(kInput) - 1, buf);
   EXPECT_STREQ(buf, kInput);
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 TEST_F(PtyTest, SwitchNoncanonToCanonNoNewlineBig) {
@@ -964,14 +967,14 @@ TEST_F(PtyTest, SwitchNoncanonToCanonNoNewlineBig) {
   ASSERT_THAT(WriteFd(master_.get(), input, kWriteLen),
               SyscallSucceedsWithValue(kWriteLen));
 
-  ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), kMaxLineSize - 1));
+  ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), kMaxLineSize - 1));
   EnableCanonical();
 
   // We can read the line.
   char buf[kMaxLineSize] = {};
-  ExpectReadable(slave_, kMaxLineSize - 1, buf);
+  ExpectReadable(replica_, kMaxLineSize - 1, buf);
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 // Tests that we can write over the 4095 noncanonical limit, then read out
@@ -990,17 +993,17 @@ TEST_F(PtyTest, NoncanonBigWrite) {
   }
 
   // We should be able to read out everything. Sleep a bit so that Linux has a
-  // chance to move data from the master to the slave.
-  ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), kMaxLineSize - 1));
+  // chance to move data from the master to the replica.
+  ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), kMaxLineSize - 1));
   for (int i = 0; i < kInputSize; i++) {
     // This makes too many syscalls for save/restore.
     const DisableSave ds;
     char c;
-    ExpectReadable(slave_, 1, &c);
+    ExpectReadable(replica_, 1, &c);
     ASSERT_EQ(c, kInput);
   }
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 // ICANON doesn't make input available until a line delimiter is typed.
@@ -1015,18 +1018,18 @@ TEST_F(PtyTest, TermiosICANONNewline) {
   char buf[5] = {};
 
   // Nothing available yet.
-  ASSERT_THAT(PollAndReadFd(slave_.get(), buf, sizeof(input), kTimeout),
+  ASSERT_THAT(PollAndReadFd(replica_.get(), buf, sizeof(input), kTimeout),
               PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
 
   char delim = '\n';
   ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
 
   // Now it is available.
-  ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), sizeof(input) + 1));
-  ExpectReadable(slave_, sizeof(input) + 1, buf);
+  ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), sizeof(input) + 1));
+  ExpectReadable(replica_, sizeof(input) + 1, buf);
   EXPECT_STREQ(buf, "abc\n");
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 // ICANON doesn't make input available until a line delimiter is typed.
@@ -1041,16 +1044,16 @@ TEST_F(PtyTest, TermiosICANONEOF) {
   char buf[4] = {};
 
   // Nothing available yet.
-  ASSERT_THAT(PollAndReadFd(slave_.get(), buf, sizeof(input), kTimeout),
+  ASSERT_THAT(PollAndReadFd(replica_.get(), buf, sizeof(input), kTimeout),
               PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
   char delim = ControlCharacter('D');
   ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
 
   // Now it is available. Note that ^D is not included.
-  ExpectReadable(slave_, sizeof(input), buf);
+  ExpectReadable(replica_, sizeof(input), buf);
   EXPECT_STREQ(buf, "abc");
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 // ICANON limits us to 4096 bytes including a terminating character. Anything
@@ -1076,12 +1079,12 @@ TEST_F(PtyTest, CanonDiscard) {
   // There should be multiple truncated lines available to read.
   for (int i = 0; i < kIter; i++) {
     char buf[kInputSize] = {};
-    ExpectReadable(slave_, kMaxLineSize, buf);
+    ExpectReadable(replica_, kMaxLineSize, buf);
     EXPECT_EQ(buf[kMaxLineSize - 1], delim);
     EXPECT_EQ(buf[kMaxLineSize - 2], kInput);
   }
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 TEST_F(PtyTest, CanonMultiline) {
@@ -1096,15 +1099,15 @@ TEST_F(PtyTest, CanonMultiline) {
 
   // Get the first line.
   char line1[8] = {};
-  ExpectReadable(slave_, sizeof(kInput1) - 1, line1);
+  ExpectReadable(replica_, sizeof(kInput1) - 1, line1);
   EXPECT_STREQ(line1, kInput1);
 
   // Get the second line.
   char line2[8] = {};
-  ExpectReadable(slave_, sizeof(kInput2) - 1, line2);
+  ExpectReadable(replica_, sizeof(kInput2) - 1, line2);
   EXPECT_STREQ(line2, kInput2);
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 TEST_F(PtyTest, SwitchNoncanonToCanonMultiline) {
@@ -1121,15 +1124,15 @@ TEST_F(PtyTest, SwitchNoncanonToCanonMultiline) {
               SyscallSucceedsWithValue(sizeof(kInput2) - 1));
 
   ASSERT_NO_ERRNO(
-      WaitUntilReceived(slave_.get(), sizeof(kInput1) + sizeof(kInput2) - 2));
+      WaitUntilReceived(replica_.get(), sizeof(kInput1) + sizeof(kInput2) - 2));
   EnableCanonical();
 
   // Get all together as one line.
   char line[9] = {};
-  ExpectReadable(slave_, 8, line);
+  ExpectReadable(replica_, 8, line);
   EXPECT_STREQ(line, kExpected);
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 TEST_F(PtyTest, SwitchTwiceMultiline) {
@@ -1146,15 +1149,15 @@ TEST_F(PtyTest, SwitchTwiceMultiline) {
   // All written characters have to make it into the input queue before
   // canonical mode is re-enabled. If the final '!' character hasn't been
   // enqueued before canonical mode is re-enabled, it won't be readable.
-  ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), kExpected.size()));
+  ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), kExpected.size()));
   EnableCanonical();
 
   // Get all together as one line.
   char line[10] = {};
-  ExpectReadable(slave_, 9, line);
+  ExpectReadable(replica_, 9, line);
   EXPECT_STREQ(line, kExpected.c_str());
 
-  ExpectFinished(slave_);
+  ExpectFinished(replica_);
 }
 
 TEST_F(PtyTest, QueueSize) {
@@ -1162,7 +1165,7 @@ TEST_F(PtyTest, QueueSize) {
   constexpr char kInput1[] = "GO\n";
   ASSERT_THAT(WriteFd(master_.get(), kInput1, sizeof(kInput1) - 1),
               SyscallSucceedsWithValue(sizeof(kInput1) - 1));
-  ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), sizeof(kInput1) - 1));
+  ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), sizeof(kInput1) - 1));
 
   // Ensure that writing more (beyond what is readable) does not impact the
   // readable size.
@@ -1171,7 +1174,7 @@ TEST_F(PtyTest, QueueSize) {
   ASSERT_THAT(WriteFd(master_.get(), input, kMaxLineSize),
               SyscallSucceedsWithValue(kMaxLineSize));
   int inputBufSize = ASSERT_NO_ERRNO_AND_VALUE(
-      WaitUntilReceived(slave_.get(), sizeof(kInput1) - 1));
+      WaitUntilReceived(replica_.get(), sizeof(kInput1) - 1));
   EXPECT_EQ(inputBufSize, sizeof(kInput1) - 1);
 }
 
@@ -1196,9 +1199,9 @@ TEST_F(PtyTest, PartialBadBuffer) {
   EXPECT_THAT(WriteFd(master_.get(), kBuf, size),
               SyscallSucceedsWithValue(size));
 
-  // Read from the slave into bad_buffer.
-  ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), size));
-  EXPECT_THAT(ReadFd(slave_.get(), bad_buffer, size),
+  // Read from the replica into bad_buffer.
+  ASSERT_NO_ERRNO(WaitUntilReceived(replica_.get(), size));
+  EXPECT_THAT(ReadFd(replica_.get(), bad_buffer, size),
               SyscallFailsWithErrno(EFAULT));
 
   EXPECT_THAT(munmap(addr, 2 * kPageSize), SyscallSucceeds()) << addr;
@@ -1218,16 +1221,16 @@ TEST_F(PtyTest, SimpleEcho) {
 
 TEST_F(PtyTest, GetWindowSize) {
   struct winsize ws;
-  ASSERT_THAT(ioctl(slave_.get(), TIOCGWINSZ, &ws), SyscallSucceeds());
+  ASSERT_THAT(ioctl(replica_.get(), TIOCGWINSZ, &ws), SyscallSucceeds());
   EXPECT_EQ(ws.ws_row, 0);
   EXPECT_EQ(ws.ws_col, 0);
 }
 
-TEST_F(PtyTest, SetSlaveWindowSize) {
+TEST_F(PtyTest, SetReplicaWindowSize) {
   constexpr uint16_t kRows = 343;
   constexpr uint16_t kCols = 2401;
   struct winsize ws = {.ws_row = kRows, .ws_col = kCols};
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSWINSZ, &ws), SyscallSucceeds());
+  ASSERT_THAT(ioctl(replica_.get(), TIOCSWINSZ, &ws), SyscallSucceeds());
 
   struct winsize retrieved_ws = {};
   ASSERT_THAT(ioctl(master_.get(), TIOCGWINSZ, &retrieved_ws),
@@ -1243,7 +1246,7 @@ TEST_F(PtyTest, SetMasterWindowSize) {
   ASSERT_THAT(ioctl(master_.get(), TIOCSWINSZ, &ws), SyscallSucceeds());
 
   struct winsize retrieved_ws = {};
-  ASSERT_THAT(ioctl(slave_.get(), TIOCGWINSZ, &retrieved_ws),
+  ASSERT_THAT(ioctl(replica_.get(), TIOCGWINSZ, &retrieved_ws),
               SyscallSucceeds());
   EXPECT_EQ(retrieved_ws.ws_row, kRows);
   EXPECT_EQ(retrieved_ws.ws_col, kCols);
@@ -1253,7 +1256,7 @@ class JobControlTest : public ::testing::Test {
  protected:
   void SetUp() override {
     master_ = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
-    slave_ = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master_));
+    replica_ = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master_));
 
     // Make this a session leader, which also drops the controlling terminal.
     // In the gVisor test environment, this test will be run as the session
@@ -1263,61 +1266,82 @@ class JobControlTest : public ::testing::Test {
     }
   }
 
-  // Master and slave ends of the PTY. Non-blocking.
+  PosixError RunInChild(SubprocessCallback childFunc) {
+    pid_t child = fork();
+    if (!child) {
+      childFunc();
+      _exit(0);
+    }
+    int wstatus;
+    if (waitpid(child, &wstatus, 0) != child) {
+      return PosixError(
+          errno, absl::StrCat("child failed with wait status: ", wstatus));
+    }
+    return PosixError(wstatus, "process returned");
+  }
+
+  // Master and replica ends of the PTY. Non-blocking.
   FileDescriptor master_;
-  FileDescriptor slave_;
+  FileDescriptor replica_;
 };
 
 TEST_F(JobControlTest, SetTTYMaster) {
-  ASSERT_THAT(ioctl(master_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+  auto res = RunInChild([=]() {
+    TEST_PCHECK(setsid() >= 0);
+    TEST_PCHECK(!ioctl(master_.get(), TIOCSCTTY, 0));
+  });
+  ASSERT_NO_ERRNO(res);
 }
 
 TEST_F(JobControlTest, SetTTY) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+  auto res = RunInChild([=]() {
+    TEST_PCHECK(setsid() >= 0);
+    TEST_PCHECK(ioctl(!replica_.get(), TIOCSCTTY, 0));
+  });
+  ASSERT_NO_ERRNO(res);
 }
 
 TEST_F(JobControlTest, SetTTYNonLeader) {
   // Fork a process that won't be the session leader.
-  pid_t child = fork();
-  if (!child) {
-    // We shouldn't be able to set the terminal.
-    TEST_PCHECK(ioctl(slave_.get(), TIOCSCTTY, 0));
-    _exit(0);
-  }
-
-  int wstatus;
-  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  ASSERT_EQ(wstatus, 0);
+  auto res =
+      RunInChild([=]() { TEST_PCHECK(ioctl(replica_.get(), TIOCSCTTY, 0)); });
+  ASSERT_NO_ERRNO(res);
 }
 
 TEST_F(JobControlTest, SetTTYBadArg) {
-  // Despite the man page saying arg should be 0 here, Linux doesn't actually
-  // check.
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 1), SyscallSucceeds());
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+  auto res = RunInChild([=]() {
+    TEST_PCHECK(setsid() >= 0);
+    TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 1));
+  });
+  ASSERT_NO_ERRNO(res);
 }
 
 TEST_F(JobControlTest, SetTTYDifferentSession) {
   SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
 
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  // Fork, join a new session, and try to steal the parent's controlling
-  // terminal, which should fail.
-  pid_t child = fork();
-  if (!child) {
+  auto res = RunInChild([=]() {
     TEST_PCHECK(setsid() >= 0);
-    // We shouldn't be able to steal the terminal.
-    TEST_PCHECK(ioctl(slave_.get(), TIOCSCTTY, 1));
-    _exit(0);
-  }
+    TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 1));
 
-  int wstatus;
-  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  ASSERT_EQ(wstatus, 0);
+    // Fork, join a new session, and try to steal the parent's controlling
+    // terminal, which should fail.
+    pid_t grandchild = fork();
+    if (!grandchild) {
+      TEST_PCHECK(setsid() >= 0);
+      // We shouldn't be able to steal the terminal.
+      TEST_PCHECK(ioctl(replica_.get(), TIOCSCTTY, 1));
+      _exit(0);
+    }
+
+    int gcwstatus;
+    TEST_PCHECK(waitpid(grandchild, &gcwstatus, 0) == grandchild);
+    TEST_PCHECK(gcwstatus == 0);
+  });
 }
 
 TEST_F(JobControlTest, ReleaseTTY) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+  ASSERT_THAT(ioctl(replica_.get(), TIOCSCTTY, 0), SyscallSucceeds());
 
   // Make sure we're ignoring SIGHUP, which will be sent to this process once we
   // disconnect they TTY.
@@ -1327,48 +1351,60 @@ TEST_F(JobControlTest, ReleaseTTY) {
   sigemptyset(&sa.sa_mask);
   struct sigaction old_sa;
   EXPECT_THAT(sigaction(SIGHUP, &sa, &old_sa), SyscallSucceeds());
-  EXPECT_THAT(ioctl(slave_.get(), TIOCNOTTY), SyscallSucceeds());
+  EXPECT_THAT(ioctl(replica_.get(), TIOCNOTTY), SyscallSucceeds());
   EXPECT_THAT(sigaction(SIGHUP, &old_sa, NULL), SyscallSucceeds());
 }
 
 TEST_F(JobControlTest, ReleaseUnsetTTY) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCNOTTY), SyscallFailsWithErrno(ENOTTY));
+  ASSERT_THAT(ioctl(replica_.get(), TIOCNOTTY), SyscallFailsWithErrno(ENOTTY));
 }
 
 TEST_F(JobControlTest, ReleaseWrongTTY) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  ASSERT_THAT(ioctl(master_.get(), TIOCNOTTY), SyscallFailsWithErrno(ENOTTY));
+  auto res = RunInChild([=]() {
+    TEST_PCHECK(setsid() >= 0);
+    TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 0));
+    TEST_PCHECK(ioctl(master_.get(), TIOCNOTTY) < 0 && errno == ENOTTY);
+  });
+  ASSERT_NO_ERRNO(res);
 }
 
 TEST_F(JobControlTest, ReleaseTTYNonLeader) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+  auto ret = RunInChild([=]() {
+    TEST_PCHECK(setsid() >= 0);
+    TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 0));
 
-  pid_t child = fork();
-  if (!child) {
-    TEST_PCHECK(!ioctl(slave_.get(), TIOCNOTTY));
-    _exit(0);
-  }
+    pid_t grandchild = fork();
+    if (!grandchild) {
+      TEST_PCHECK(!ioctl(replica_.get(), TIOCNOTTY));
+      _exit(0);
+    }
 
-  int wstatus;
-  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  ASSERT_EQ(wstatus, 0);
+    int wstatus;
+    TEST_PCHECK(waitpid(grandchild, &wstatus, 0) == grandchild);
+    TEST_PCHECK(wstatus == 0);
+  });
+  ASSERT_NO_ERRNO(ret);
 }
 
 TEST_F(JobControlTest, ReleaseTTYDifferentSession) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  pid_t child = fork();
-  if (!child) {
-    // Join a new session, then try to disconnect.
+  auto ret = RunInChild([=]() {
     TEST_PCHECK(setsid() >= 0);
-    TEST_PCHECK(ioctl(slave_.get(), TIOCNOTTY));
-    _exit(0);
-  }
 
-  int wstatus;
-  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  ASSERT_EQ(wstatus, 0);
+    TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 0));
+
+    pid_t grandchild = fork();
+    if (!grandchild) {
+      // Join a new session, then try to disconnect.
+      TEST_PCHECK(setsid() >= 0);
+      TEST_PCHECK(ioctl(replica_.get(), TIOCNOTTY));
+      _exit(0);
+    }
+
+    int wstatus;
+    TEST_PCHECK(waitpid(grandchild, &wstatus, 0) == grandchild);
+    TEST_PCHECK(wstatus == 0);
+  });
+  ASSERT_NO_ERRNO(ret);
 }
 
 // Used by the child process spawned in ReleaseTTYSignals to track received
@@ -1387,7 +1423,7 @@ void sig_handler(int signum) { received |= signum; }
 // - Checks that thread 1 got both signals
 // - Checks that thread 2 didn't get any signals.
 TEST_F(JobControlTest, ReleaseTTYSignals) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+  ASSERT_THAT(ioctl(replica_.get(), TIOCSCTTY, 0), SyscallSucceeds());
 
   received = 0;
   struct sigaction sa = {};
@@ -1439,7 +1475,7 @@ TEST_F(JobControlTest, ReleaseTTYSignals) {
 
   // Release the controlling terminal, sending SIGHUP and SIGCONT to all other
   // processes in this process group.
-  EXPECT_THAT(ioctl(slave_.get(), TIOCNOTTY), SyscallSucceeds());
+  EXPECT_THAT(ioctl(replica_.get(), TIOCNOTTY), SyscallSucceeds());
 
   EXPECT_THAT(sigaction(SIGHUP, &old_sa, NULL), SyscallSucceeds());
 
@@ -1456,20 +1492,21 @@ TEST_F(JobControlTest, ReleaseTTYSignals) {
 }
 
 TEST_F(JobControlTest, GetForegroundProcessGroup) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-  pid_t foreground_pgid;
-  pid_t pid;
-  ASSERT_THAT(ioctl(slave_.get(), TIOCGPGRP, &foreground_pgid),
-              SyscallSucceeds());
-  ASSERT_THAT(pid = getpid(), SyscallSucceeds());
-
-  ASSERT_EQ(foreground_pgid, pid);
+  auto res = RunInChild([=]() {
+    pid_t pid, foreground_pgid;
+    TEST_PCHECK(setsid() >= 0);
+    TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 1));
+    TEST_PCHECK(!ioctl(replica_.get(), TIOCGPGRP, &foreground_pgid));
+    TEST_PCHECK((pid = getpid()) >= 0);
+    TEST_PCHECK(pid == foreground_pgid);
+  });
+  ASSERT_NO_ERRNO(res);
 }
 
 TEST_F(JobControlTest, GetForegroundProcessGroupNonControlling) {
   // At this point there's no controlling terminal, so TIOCGPGRP should fail.
   pid_t foreground_pgid;
-  ASSERT_THAT(ioctl(slave_.get(), TIOCGPGRP, &foreground_pgid),
+  ASSERT_THAT(ioctl(replica_.get(), TIOCGPGRP, &foreground_pgid),
               SyscallFailsWithErrno(ENOTTY));
 }
 
@@ -1479,113 +1516,125 @@ TEST_F(JobControlTest, GetForegroundProcessGroupNonControlling) {
 // - sets that child as the foreground process group
 // - kills its child and sets itself as the foreground process group.
 TEST_F(JobControlTest, SetForegroundProcessGroup) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  // Ignore SIGTTOU so that we don't stop ourself when calling tcsetpgrp.
-  struct sigaction sa = {};
-  sa.sa_handler = SIG_IGN;
-  sa.sa_flags = 0;
-  sigemptyset(&sa.sa_mask);
-  sigaction(SIGTTOU, &sa, NULL);
-
-  // Set ourself as the foreground process group.
-  ASSERT_THAT(tcsetpgrp(slave_.get(), getpgid(0)), SyscallSucceeds());
-
-  // Create a new process that just waits to be signaled.
-  pid_t child = fork();
-  if (!child) {
-    TEST_PCHECK(!pause());
-    // We should never reach this.
-    _exit(1);
-  }
-
-  // Make the child its own process group, then make it the controlling process
-  // group of the terminal.
-  ASSERT_THAT(setpgid(child, child), SyscallSucceeds());
-  ASSERT_THAT(tcsetpgrp(slave_.get(), child), SyscallSucceeds());
+  auto res = RunInChild([=]() {
+    TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 0));
+
+    // Ignore SIGTTOU so that we don't stop ourself when calling tcsetpgrp.
+    struct sigaction sa = {};
+    sa.sa_handler = SIG_IGN;
+    sa.sa_flags = 0;
+    sigemptyset(&sa.sa_mask);
+    sigaction(SIGTTOU, &sa, NULL);
+
+    // Set ourself as the foreground process group.
+    TEST_PCHECK(!tcsetpgrp(replica_.get(), getpgid(0)));
+
+    // Create a new process that just waits to be signaled.
+    pid_t grandchild = fork();
+    if (!grandchild) {
+      TEST_PCHECK(!pause());
+      // We should never reach this.
+      _exit(1);
+    }
 
-  // Sanity check - we're still the controlling session.
-  ASSERT_EQ(getsid(0), getsid(child));
+    // Make the child its own process group, then make it the controlling
+    // process group of the terminal.
+    TEST_PCHECK(!setpgid(grandchild, grandchild));
+    TEST_PCHECK(!tcsetpgrp(replica_.get(), grandchild));
 
-  // Signal the child, wait for it to exit, then retake the terminal.
-  ASSERT_THAT(kill(child, SIGTERM), SyscallSucceeds());
-  int wstatus;
-  ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  ASSERT_TRUE(WIFSIGNALED(wstatus));
-  ASSERT_EQ(WTERMSIG(wstatus), SIGTERM);
+    // Sanity check - we're still the controlling session.
+    TEST_PCHECK(getsid(0) == getsid(grandchild));
 
-  // Set ourself as the foreground process.
-  pid_t pgid;
-  ASSERT_THAT(pgid = getpgid(0), SyscallSucceeds());
-  ASSERT_THAT(tcsetpgrp(slave_.get(), pgid), SyscallSucceeds());
+    // Signal the child, wait for it to exit, then retake the terminal.
+    TEST_PCHECK(!kill(grandchild, SIGTERM));
+    int wstatus;
+    TEST_PCHECK(waitpid(grandchild, &wstatus, 0) == grandchild);
+    TEST_PCHECK(WIFSIGNALED(wstatus));
+    TEST_PCHECK(WTERMSIG(wstatus) == SIGTERM);
+
+    // Set ourself as the foreground process.
+    pid_t pgid;
+    TEST_PCHECK(pgid = getpgid(0) == 0);
+    TEST_PCHECK(!tcsetpgrp(replica_.get(), pgid));
+  });
 }
 
 TEST_F(JobControlTest, SetForegroundProcessGroupWrongTTY) {
   pid_t pid = getpid();
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSPGRP, &pid),
+  ASSERT_THAT(ioctl(replica_.get(), TIOCSPGRP, &pid),
               SyscallFailsWithErrno(ENOTTY));
 }
 
 TEST_F(JobControlTest, SetForegroundProcessGroupNegPgid) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+  auto ret = RunInChild([=]() {
+    TEST_PCHECK(setsid() >= 0);
+    TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 0));
 
-  pid_t pid = -1;
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSPGRP, &pid),
-              SyscallFailsWithErrno(EINVAL));
+    pid_t pid = -1;
+    TEST_PCHECK(ioctl(replica_.get(), TIOCSPGRP, &pid) && errno == EINVAL);
+  });
+  ASSERT_NO_ERRNO(ret);
 }
 
 TEST_F(JobControlTest, SetForegroundProcessGroupEmptyProcessGroup) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
-
-  // Create a new process, put it in a new process group, make that group the
-  // foreground process group, then have the process wait.
-  pid_t child = fork();
-  if (!child) {
-    TEST_PCHECK(!setpgid(0, 0));
-    _exit(0);
-  }
+  auto ret = RunInChild([=]() {
+    TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 0));
+
+    // Create a new process, put it in a new process group, make that group the
+    // foreground process group, then have the process wait.
+    pid_t grandchild = fork();
+    if (!grandchild) {
+      TEST_PCHECK(!setpgid(0, 0));
+      _exit(0);
+    }
 
-  // Wait for the child to exit.
-  int wstatus;
-  EXPECT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  // The child's process group doesn't exist anymore - this should fail.
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSPGRP, &child),
-              SyscallFailsWithErrno(ESRCH));
+    // Wait for the child to exit.
+    int wstatus;
+    TEST_PCHECK(waitpid(grandchild, &wstatus, 0) == grandchild);
+    // The child's process group doesn't exist anymore - this should fail.
+    TEST_PCHECK(ioctl(replica_.get(), TIOCSPGRP, &grandchild) != 0 &&
+                errno == ESRCH);
+  });
 }
 
 TEST_F(JobControlTest, SetForegroundProcessGroupDifferentSession) {
-  ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+  auto ret = RunInChild([=]() {
+    TEST_PCHECK(setsid() >= 0);
+    TEST_PCHECK(!ioctl(replica_.get(), TIOCSCTTY, 0));
 
-  int sync_setsid[2];
-  int sync_exit[2];
-  ASSERT_THAT(pipe(sync_setsid), SyscallSucceeds());
-  ASSERT_THAT(pipe(sync_exit), SyscallSucceeds());
+    int sync_setsid[2];
+    int sync_exit[2];
+    TEST_PCHECK(pipe(sync_setsid) >= 0);
+    TEST_PCHECK(pipe(sync_exit) >= 0);
 
-  // Create a new process and put it in a new session.
-  pid_t child = fork();
-  if (!child) {
-    TEST_PCHECK(setsid() >= 0);
-    // Tell the parent we're in a new session.
-    char c = 'c';
-    TEST_PCHECK(WriteFd(sync_setsid[1], &c, 1) == 1);
-    TEST_PCHECK(ReadFd(sync_exit[0], &c, 1) == 1);
-    _exit(0);
-  }
+    // Create a new process and put it in a new session.
+    pid_t grandchild = fork();
+    if (!grandchild) {
+      TEST_PCHECK(setsid() >= 0);
+      // Tell the parent we're in a new session.
+      char c = 'c';
+      TEST_PCHECK(WriteFd(sync_setsid[1], &c, 1) == 1);
+      TEST_PCHECK(ReadFd(sync_exit[0], &c, 1) == 1);
+      _exit(0);
+    }
 
-  // Wait for the child to tell us it's in a new session.
-  char c = 'c';
-  ASSERT_THAT(ReadFd(sync_setsid[0], &c, 1), SyscallSucceedsWithValue(1));
+    // Wait for the child to tell us it's in a new session.
+    char c = 'c';
+    TEST_PCHECK(ReadFd(sync_setsid[0], &c, 1) == 1);
 
-  // Child is in a new session, so we can't make it the foregroup process group.
-  EXPECT_THAT(ioctl(slave_.get(), TIOCSPGRP, &child),
-              SyscallFailsWithErrno(EPERM));
+    // Child is in a new session, so we can't make it the foregroup process
+    // group.
+    TEST_PCHECK(ioctl(replica_.get(), TIOCSPGRP, &grandchild) &&
+                errno == EPERM);
 
-  EXPECT_THAT(WriteFd(sync_exit[1], &c, 1), SyscallSucceedsWithValue(1));
+    TEST_PCHECK(WriteFd(sync_exit[1], &c, 1) == 1);
 
-  int wstatus;
-  EXPECT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
-  EXPECT_TRUE(WIFEXITED(wstatus));
-  EXPECT_EQ(WEXITSTATUS(wstatus), 0);
+    int wstatus;
+    TEST_PCHECK(waitpid(grandchild, &wstatus, 0) == grandchild);
+    TEST_PCHECK(WIFEXITED(wstatus));
+    TEST_PCHECK(!WEXITSTATUS(wstatus));
+  });
+  ASSERT_NO_ERRNO(ret);
 }
 
 // Verify that we don't hang when creating a new session from an orphaned
diff --git a/test/syscalls/linux/pty_root.cc b/test/syscalls/linux/pty_root.cc
index 1d7dbefdb..4ac648729 100644
--- a/test/syscalls/linux/pty_root.cc
+++ b/test/syscalls/linux/pty_root.cc
@@ -50,10 +50,10 @@ TEST(JobControlRootTest, StealTTY) {
 
   FileDescriptor master =
       ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
-  FileDescriptor slave = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master));
+  FileDescriptor replica = ASSERT_NO_ERRNO_AND_VALUE(OpenReplica(master));
 
-  // Make slave the controlling terminal.
-  ASSERT_THAT(ioctl(slave.get(), TIOCSCTTY, 0), SyscallSucceeds());
+  // Make replica the controlling terminal.
+  ASSERT_THAT(ioctl(replica.get(), TIOCSCTTY, 0), SyscallSucceeds());
 
   // Fork, join a new session, and try to steal the parent's controlling
   // terminal, which should succeed when we have CAP_SYS_ADMIN and pass an arg
@@ -62,9 +62,9 @@ TEST(JobControlRootTest, StealTTY) {
   if (!child) {
     ASSERT_THAT(setsid(), SyscallSucceeds());
     // We shouldn't be able to steal the terminal with the wrong arg value.
-    TEST_PCHECK(ioctl(slave.get(), TIOCSCTTY, 0));
+    TEST_PCHECK(ioctl(replica.get(), TIOCSCTTY, 0));
     // We should be able to steal it if we are true root.
-    TEST_PCHECK(true_root == !ioctl(slave.get(), TIOCSCTTY, 1));
+    TEST_PCHECK(true_root == !ioctl(replica.get(), TIOCSCTTY, 1));
     _exit(0);
   }
 
diff --git a/test/syscalls/linux/raw_socket.cc b/test/syscalls/linux/raw_socket.cc
index 8d6e5c913..54709371c 100644
--- a/test/syscalls/linux/raw_socket.cc
+++ b/test/syscalls/linux/raw_socket.cc
@@ -13,9 +13,7 @@
 // limitations under the License.
 
 #include <linux/capability.h>
-#ifndef __fuchsia__
 #include <linux/filter.h>
-#endif  // __fuchsia__
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
@@ -815,8 +813,6 @@ void RawSocketTest::ReceiveBufFrom(int sock, char* recv_buf,
   ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(sock, recv_buf, recv_buf_len));
 }
 
-#ifndef __fuchsia__
-
 TEST_P(RawSocketTest, SetSocketDetachFilterNoInstalledFilter) {
   // TODO(gvisor.dev/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
   if (IsRunningOnGvisor()) {
@@ -838,8 +834,6 @@ TEST_P(RawSocketTest, GetSocketDetachFilter) {
               SyscallFailsWithErrno(ENOPROTOOPT));
 }
 
-#endif  //  __fuchsia__
-
 // AF_INET6+SOCK_RAW+IPPROTO_RAW sockets can be created, but not written to.
 TEST(RawSocketTest, IPv6ProtoRaw) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
diff --git a/test/syscalls/linux/raw_socket_icmp.cc b/test/syscalls/linux/raw_socket_icmp.cc
index 3de898df7..bd779da92 100644
--- a/test/syscalls/linux/raw_socket_icmp.cc
+++ b/test/syscalls/linux/raw_socket_icmp.cc
@@ -416,6 +416,41 @@ TEST_F(RawSocketICMPTest, BindConnectSendAndReceive) {
   ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp));
 }
 
+// Set and get SO_LINGER.
+TEST_F(RawSocketICMPTest, SetAndGetSocketLinger) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int level = SOL_SOCKET;
+  int type = SO_LINGER;
+
+  struct linger sl;
+  sl.l_onoff = 1;
+  sl.l_linger = 5;
+  ASSERT_THAT(setsockopt(s_, level, type, &sl, sizeof(sl)),
+              SyscallSucceedsWithValue(0));
+
+  struct linger got_linger = {};
+  socklen_t length = sizeof(sl);
+  ASSERT_THAT(getsockopt(s_, level, type, &got_linger, &length),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_EQ(length, sizeof(got_linger));
+  EXPECT_EQ(0, memcmp(&sl, &got_linger, length));
+}
+
+// Test getsockopt for SO_ACCEPTCONN.
+TEST_F(RawSocketICMPTest, GetSocketAcceptConn) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int got = -1;
+  socklen_t length = sizeof(got);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_ACCEPTCONN, &got, &length),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_EQ(length, sizeof(got));
+  EXPECT_EQ(got, 0);
+}
+
 void RawSocketICMPTest::ExpectICMPSuccess(const struct icmphdr& icmp) {
   // We're going to receive both the echo request and reply, but the order is
   // indeterminate.
diff --git a/test/syscalls/linux/rename.cc b/test/syscalls/linux/rename.cc
index 833c0dc4f..5458f54ad 100644
--- a/test/syscalls/linux/rename.cc
+++ b/test/syscalls/linux/rename.cc
@@ -170,6 +170,9 @@ TEST(RenameTest, FileOverwritesFile) {
 }
 
 TEST(RenameTest, DirectoryOverwritesDirectoryLinkCount) {
+  // Directory link counts are synthetic on overlay filesystems.
+  SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(IsOverlayfs(GetAbsoluteTestTmpdir())));
+
   auto parent1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   EXPECT_THAT(Links(parent1.path()), IsPosixErrorOkAndHolds(2));
 
diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc
index 4bfb1ff56..94f9154a0 100644
--- a/test/syscalls/linux/rseq.cc
+++ b/test/syscalls/linux/rseq.cc
@@ -24,6 +24,7 @@
 #include "test/syscalls/linux/rseq/uapi.h"
 #include "test/util/logging.h"
 #include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
@@ -31,6 +32,9 @@ namespace testing {
 
 namespace {
 
+using ::testing::AnyOf;
+using ::testing::Eq;
+
 // Syscall test for rseq (restartable sequences).
 //
 // We must be very careful about how these tests are written. Each thread may
@@ -98,7 +102,7 @@ void RunChildTest(std::string test_case, int want_status) {
 
   int status = 0;
   ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
-  ASSERT_EQ(status, want_status);
+  ASSERT_THAT(status, AnyOf(Eq(want_status), Eq(128 + want_status)));
 }
 
 // Test that rseq must be aligned.
diff --git a/test/syscalls/linux/rseq/rseq.cc b/test/syscalls/linux/rseq/rseq.cc
index f036db26d..6f5d38bba 100644
--- a/test/syscalls/linux/rseq/rseq.cc
+++ b/test/syscalls/linux/rseq/rseq.cc
@@ -74,84 +74,95 @@ int TestUnaligned() {
 // Sanity test that registration works.
 int TestRegister() {
   struct rseq r = {};
-  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+  int ret = sys_rseq(&r, sizeof(r), 0, 0);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
   return 0;
-};
+}
 
 // Registration can't be done twice.
 int TestDoubleRegister() {
   struct rseq r = {};
-  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+  int ret = sys_rseq(&r, sizeof(r), 0, 0);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
 
-  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != EBUSY) {
+  ret = sys_rseq(&r, sizeof(r), 0, 0);
+  if (sys_errno(ret) != EBUSY) {
     return 1;
   }
 
   return 0;
-};
+}
 
 // Registration can be done again after unregister.
 int TestRegisterUnregister() {
   struct rseq r = {};
-  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+
+  int ret = sys_rseq(&r, sizeof(r), 0, 0);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
 
-  if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, 0);
-      sys_errno(ret) != 0) {
+  ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, 0);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
 
-  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+  ret = sys_rseq(&r, sizeof(r), 0, 0);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
 
   return 0;
-};
+}
 
 // The pointer to rseq must match on register/unregister.
 int TestUnregisterDifferentPtr() {
   struct rseq r = {};
-  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+
+  int ret = sys_rseq(&r, sizeof(r), 0, 0);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
 
   struct rseq r2 = {};
-  if (int ret = sys_rseq(&r2, sizeof(r2), kRseqFlagUnregister, 0);
-      sys_errno(ret) != EINVAL) {
+
+  ret = sys_rseq(&r2, sizeof(r2), kRseqFlagUnregister, 0);
+  if (sys_errno(ret) != EINVAL) {
     return 1;
   }
 
   return 0;
-};
+}
 
 // The signature must match on register/unregister.
 int TestUnregisterDifferentSignature() {
   constexpr int kSignature = 0;
 
   struct rseq r = {};
-  if (int ret = sys_rseq(&r, sizeof(r), 0, kSignature); sys_errno(ret) != 0) {
+  int ret = sys_rseq(&r, sizeof(r), 0, kSignature);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
 
-  if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, kSignature + 1);
-      sys_errno(ret) != EPERM) {
+  ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, kSignature + 1);
+  if (sys_errno(ret) != EPERM) {
     return 1;
   }
 
   return 0;
-};
+}
 
 // The CPU ID is initialized.
 int TestCPU() {
   struct rseq r = {};
   r.cpu_id = kRseqCPUIDUninitialized;
 
-  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+  int ret = sys_rseq(&r, sizeof(r), 0, 0);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
 
@@ -163,13 +174,13 @@ int TestCPU() {
   }
 
   return 0;
-};
+}
 
 // Critical section is eventually aborted.
 int TestAbort() {
   struct rseq r = {};
-  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
-      sys_errno(ret) != 0) {
+  int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
 
@@ -185,13 +196,13 @@ int TestAbort() {
   rseq_loop(&r, &cs);
 
   return 0;
-};
+}
 
 // Abort may be before the critical section.
 int TestAbortBefore() {
   struct rseq r = {};
-  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
-      sys_errno(ret) != 0) {
+  int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
 
@@ -207,13 +218,13 @@ int TestAbortBefore() {
   rseq_loop(&r, &cs);
 
   return 0;
-};
+}
 
 // Signature must match.
 int TestAbortSignature() {
   struct rseq r = {};
-  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
-      sys_errno(ret) != 0) {
+  int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
 
@@ -229,13 +240,13 @@ int TestAbortSignature() {
   rseq_loop(&r, &cs);
 
   return 1;
-};
+}
 
 // Abort must not be in the critical section.
 int TestAbortPreCommit() {
   struct rseq r = {};
-  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
-      sys_errno(ret) != 0) {
+  int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
 
@@ -251,13 +262,13 @@ int TestAbortPreCommit() {
   rseq_loop(&r, &cs);
 
   return 1;
-};
+}
 
 // rseq.rseq_cs is cleared on abort.
 int TestAbortClearsCS() {
   struct rseq r = {};
-  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
-      sys_errno(ret) != 0) {
+  int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
 
@@ -277,13 +288,13 @@ int TestAbortClearsCS() {
   }
 
   return 0;
-};
+}
 
 // rseq.rseq_cs is cleared on abort outside of critical section.
 int TestInvalidAbortClearsCS() {
   struct rseq r = {};
-  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
-      sys_errno(ret) != 0) {
+  int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+  if (sys_errno(ret) != 0) {
     return 1;
   }
 
@@ -306,7 +317,7 @@ int TestInvalidAbortClearsCS() {
   }
 
   return 0;
-};
+}
 
 // Exit codes:
 //  0 - Pass
diff --git a/test/syscalls/linux/rseq/test.h b/test/syscalls/linux/rseq/test.h
index 3b7bb74b1..ff0dd6e48 100644
--- a/test/syscalls/linux/rseq/test.h
+++ b/test/syscalls/linux/rseq/test.h
@@ -20,22 +20,20 @@ namespace testing {
 
 // Test cases supported by rseq binary.
 
-inline constexpr char kRseqTestUnaligned[] = "unaligned";
-inline constexpr char kRseqTestRegister[] = "register";
-inline constexpr char kRseqTestDoubleRegister[] = "double-register";
-inline constexpr char kRseqTestRegisterUnregister[] = "register-unregister";
-inline constexpr char kRseqTestUnregisterDifferentPtr[] =
-    "unregister-different-ptr";
-inline constexpr char kRseqTestUnregisterDifferentSignature[] =
+constexpr char kRseqTestUnaligned[] = "unaligned";
+constexpr char kRseqTestRegister[] = "register";
+constexpr char kRseqTestDoubleRegister[] = "double-register";
+constexpr char kRseqTestRegisterUnregister[] = "register-unregister";
+constexpr char kRseqTestUnregisterDifferentPtr[] = "unregister-different-ptr";
+constexpr char kRseqTestUnregisterDifferentSignature[] =
     "unregister-different-signature";
-inline constexpr char kRseqTestCPU[] = "cpu";
-inline constexpr char kRseqTestAbort[] = "abort";
-inline constexpr char kRseqTestAbortBefore[] = "abort-before";
-inline constexpr char kRseqTestAbortSignature[] = "abort-signature";
-inline constexpr char kRseqTestAbortPreCommit[] = "abort-precommit";
-inline constexpr char kRseqTestAbortClearsCS[] = "abort-clears-cs";
-inline constexpr char kRseqTestInvalidAbortClearsCS[] =
-    "invalid-abort-clears-cs";
+constexpr char kRseqTestCPU[] = "cpu";
+constexpr char kRseqTestAbort[] = "abort";
+constexpr char kRseqTestAbortBefore[] = "abort-before";
+constexpr char kRseqTestAbortSignature[] = "abort-signature";
+constexpr char kRseqTestAbortPreCommit[] = "abort-precommit";
+constexpr char kRseqTestAbortClearsCS[] = "abort-clears-cs";
+constexpr char kRseqTestInvalidAbortClearsCS[] = "invalid-abort-clears-cs";
 
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc
index e9b131ca9..ed6a1c2aa 100644
--- a/test/syscalls/linux/semaphore.cc
+++ b/test/syscalls/linux/semaphore.cc
@@ -486,6 +486,62 @@ TEST(SemaphoreTest, SemIpcSet) {
   ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallFailsWithErrno(EACCES));
 }
 
+TEST(SemaphoreTest, SemCtlIpcStat) {
+  // Drop CAP_IPC_OWNER which allows us to bypass semaphore permissions.
+  ASSERT_NO_ERRNO(SetCapability(CAP_IPC_OWNER, false));
+  const uid_t kUid = getuid();
+  const gid_t kGid = getgid();
+  time_t start_time = time(nullptr);
+
+  AutoSem sem(semget(IPC_PRIVATE, 10, 0600 | IPC_CREAT));
+  ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+  struct semid_ds ds;
+  EXPECT_THAT(semctl(sem.get(), 0, IPC_STAT, &ds), SyscallSucceeds());
+
+  EXPECT_EQ(ds.sem_perm.__key, IPC_PRIVATE);
+  EXPECT_EQ(ds.sem_perm.uid, kUid);
+  EXPECT_EQ(ds.sem_perm.gid, kGid);
+  EXPECT_EQ(ds.sem_perm.cuid, kUid);
+  EXPECT_EQ(ds.sem_perm.cgid, kGid);
+  EXPECT_EQ(ds.sem_perm.mode, 0600);
+  // Last semop time is not set on creation.
+  EXPECT_EQ(ds.sem_otime, 0);
+  EXPECT_GE(ds.sem_ctime, start_time);
+  EXPECT_EQ(ds.sem_nsems, 10);
+
+  // The timestamps only have a resolution of seconds; slow down so we actually
+  // see the timestamps change.
+  absl::SleepFor(absl::Seconds(1));
+
+  // Set semid_ds structure of the set.
+  auto last_ctime = ds.sem_ctime;
+  start_time = time(nullptr);
+  struct semid_ds semid_to_set = {};
+  semid_to_set.sem_perm.uid = kUid;
+  semid_to_set.sem_perm.gid = kGid;
+  semid_to_set.sem_perm.mode = 0666;
+  ASSERT_THAT(semctl(sem.get(), 0, IPC_SET, &semid_to_set), SyscallSucceeds());
+  struct sembuf buf = {};
+  buf.sem_op = 1;
+  ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds());
+
+  EXPECT_THAT(semctl(sem.get(), 0, IPC_STAT, &ds), SyscallSucceeds());
+  EXPECT_EQ(ds.sem_perm.mode, 0666);
+  EXPECT_GE(ds.sem_otime, start_time);
+  EXPECT_GT(ds.sem_ctime, last_ctime);
+
+  // An invalid semid fails the syscall with errno EINVAL.
+  EXPECT_THAT(semctl(sem.get() + 1, 0, IPC_STAT, &ds),
+              SyscallFailsWithErrno(EINVAL));
+
+  // Make semaphore not readable and check the signal fails.
+  semid_to_set.sem_perm.mode = 0200;
+  ASSERT_THAT(semctl(sem.get(), 0, IPC_SET, &semid_to_set), SyscallSucceeds());
+  EXPECT_THAT(semctl(sem.get(), 0, IPC_STAT, &ds),
+              SyscallFailsWithErrno(EACCES));
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc
index 64123e904..a8bfb01f1 100644
--- a/test/syscalls/linux/sendfile.cc
+++ b/test/syscalls/linux/sendfile.cc
@@ -198,7 +198,39 @@ TEST(SendFileTest, SendAndUpdateFileOffset) {
   EXPECT_EQ(absl::string_view(kData, kHalfDataSize),
             absl::string_view(actual, bytes_sent));
 
-  // Verify that the input file offset has been updated
+  // Verify that the input file offset has been updated.
+  ASSERT_THAT(read(inf.get(), &actual, kDataSize - bytes_sent),
+              SyscallSucceedsWithValue(kHalfDataSize));
+  EXPECT_EQ(
+      absl::string_view(kData + kDataSize - bytes_sent, kDataSize - bytes_sent),
+      absl::string_view(actual, kHalfDataSize));
+}
+
+TEST(SendFileTest, SendToDevZeroAndUpdateFileOffset) {
+  // Create temp files.
+  // Test input string length must be > 2 AND even.
+  constexpr char kData[] = "The slings and arrows of outrageous fortune,";
+  constexpr int kDataSize = sizeof(kData) - 1;
+  constexpr int kHalfDataSize = kDataSize / 2;
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+
+  // Open the input file as read only.
+  const FileDescriptor inf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Open /dev/zero as write only.
+  const FileDescriptor outf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_WRONLY));
+
+  // Send data and verify that sendfile returns the correct value.
+  int bytes_sent;
+  EXPECT_THAT(
+      bytes_sent = sendfile(outf.get(), inf.get(), nullptr, kHalfDataSize),
+      SyscallSucceedsWithValue(kHalfDataSize));
+
+  char actual[kHalfDataSize];
+  // Verify that the input file offset has been updated.
   ASSERT_THAT(read(inf.get(), &actual, kDataSize - bytes_sent),
               SyscallSucceedsWithValue(kHalfDataSize));
   EXPECT_EQ(
@@ -250,7 +282,7 @@ TEST(SendFileTest, SendAndUpdateFileOffsetFromNonzeroStartingPoint) {
   EXPECT_EQ(absl::string_view(kData + kQuarterDataSize, kHalfDataSize),
             absl::string_view(actual, bytes_sent));
 
-  // Verify that the input file offset has been updated
+  // Verify that the input file offset has been updated.
   ASSERT_THAT(read(inf.get(), &actual, kQuarterDataSize),
               SyscallSucceedsWithValue(kQuarterDataSize));
 
@@ -501,6 +533,22 @@ TEST(SendFileTest, SendPipeWouldBlock) {
               SyscallFailsWithErrno(EWOULDBLOCK));
 }
 
+TEST(SendFileTest, SendPipeEOF) {
+  // Create and open an empty input file.
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor inf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Setup the output named pipe.
+  int fds[2];
+  ASSERT_THAT(pipe2(fds, O_NONBLOCK), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  EXPECT_THAT(sendfile(wfd.get(), inf.get(), nullptr, 123),
+              SyscallSucceedsWithValue(0));
+}
+
 TEST(SendFileTest, SendPipeBlocks) {
   // Create temp file.
   constexpr char kData[] =
diff --git a/test/syscalls/linux/socket.cc b/test/syscalls/linux/socket.cc
index c20cd3fcc..e680d3dd7 100644
--- a/test/syscalls/linux/socket.cc
+++ b/test/syscalls/linux/socket.cc
@@ -14,6 +14,7 @@
 
 #include <sys/socket.h>
 #include <sys/stat.h>
+#include <sys/statfs.h>
 #include <sys/types.h>
 #include <unistd.h>
 
@@ -26,6 +27,9 @@
 namespace gvisor {
 namespace testing {
 
+// From linux/magic.h, but we can't depend on linux headers here.
+#define SOCKFS_MAGIC 0x534F434B
+
 TEST(SocketTest, UnixSocketPairProtocol) {
   int socks[2];
   ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, PF_UNIX, socks),
@@ -94,6 +98,19 @@ TEST(SocketTest, UnixSocketStat) {
   }
 }
 
+TEST(SocketTest, UnixSocketStatFS) {
+  SKIP_IF(IsRunningWithVFS1());
+
+  FileDescriptor bound =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX));
+
+  struct statfs st;
+  EXPECT_THAT(fstatfs(bound.get(), &st), SyscallSucceeds());
+  EXPECT_EQ(st.f_type, SOCKFS_MAGIC);
+  EXPECT_EQ(st.f_bsize, getpagesize());
+  EXPECT_EQ(st.f_namelen, NAME_MAX);
+}
+
 using SocketOpenTest = ::testing::TestWithParam<int>;
 
 // UDS cannot be opened.
diff --git a/test/syscalls/linux/socket_generic_stress.cc b/test/syscalls/linux/socket_generic_stress.cc
index 19239e9e9..6cd67123d 100644
--- a/test/syscalls/linux/socket_generic_stress.cc
+++ b/test/syscalls/linux/socket_generic_stress.cc
@@ -30,6 +30,9 @@ namespace testing {
 using ConnectStressTest = SocketPairTest;
 
 TEST_P(ConnectStressTest, Reset65kTimes) {
+  // TODO(b/165912341): These are too slow on KVM platform with nested virt.
+  SKIP_IF(GvisorPlatform() == Platform::kKVM);
+
   for (int i = 0; i < 1 << 16; ++i) {
     auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -68,6 +71,9 @@ INSTANTIATE_TEST_SUITE_P(
 using PersistentListenerConnectStressTest = SocketPairTest;
 
 TEST_P(PersistentListenerConnectStressTest, 65kTimesShutdownCloseFirst) {
+  // TODO(b/165912341): These are too slow on KVM platform with nested virt.
+  SKIP_IF(GvisorPlatform() == Platform::kKVM);
+
   for (int i = 0; i < 1 << 16; ++i) {
     auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
     ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_RDWR), SyscallSucceeds());
@@ -87,6 +93,9 @@ TEST_P(PersistentListenerConnectStressTest, 65kTimesShutdownCloseFirst) {
 }
 
 TEST_P(PersistentListenerConnectStressTest, 65kTimesShutdownCloseSecond) {
+  // TODO(b/165912341): These are too slow on KVM platform with nested virt.
+  SKIP_IF(GvisorPlatform() == Platform::kKVM);
+
   for (int i = 0; i < 1 << 16; ++i) {
     auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
     ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_RDWR), SyscallSucceeds());
@@ -106,6 +115,9 @@ TEST_P(PersistentListenerConnectStressTest, 65kTimesShutdownCloseSecond) {
 }
 
 TEST_P(PersistentListenerConnectStressTest, 65kTimesClose) {
+  // TODO(b/165912341): These are too slow on KVM platform with nested virt.
+  SKIP_IF(GvisorPlatform() == Platform::kKVM);
+
   for (int i = 0; i < 1 << 16; ++i) {
     auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
   }
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index c3b42682f..39a68c5a5 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -97,11 +97,9 @@ TEST(BadSocketPairArgs, ValidateErrForBadCallsToSocketPair) {
   ASSERT_THAT(socketpair(AF_INET6, 0, 0, fd),
               SyscallFailsWithErrno(ESOCKTNOSUPPORT));
 
-  // Invalid AF will return ENOAFSUPPORT.
-  ASSERT_THAT(socketpair(AF_MAX, 0, 0, fd),
-              SyscallFailsWithErrno(EAFNOSUPPORT));
-  ASSERT_THAT(socketpair(8675309, 0, 0, fd),
-              SyscallFailsWithErrno(EAFNOSUPPORT));
+  // Invalid AF will fail.
+  ASSERT_THAT(socketpair(AF_MAX, 0, 0, fd), SyscallFails());
+  ASSERT_THAT(socketpair(8675309, 0, 0, fd), SyscallFails());
 }
 
 enum class Operation {
@@ -116,7 +114,8 @@ std::string OperationToString(Operation operation) {
       return "Bind";
     case Operation::Connect:
       return "Connect";
-    case Operation::SendTo:
+    // Operation::SendTo is the default.
+    default:
       return "SendTo";
   }
 }
@@ -351,6 +350,10 @@ TEST_P(SocketInetLoopbackTest, TCPListenShutdownListen) {
   sockaddr_storage conn_addr = connector.addr;
   ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
 
+  // TODO(b/157236388): Remove Disable save after bug is fixed. S/R test can
+  // fail because the last socket may not be delivered to the accept queue
+  // by the time connect returns.
+  DisableSave ds;
   for (int i = 0; i < kBacklog; i++) {
     auto client = ASSERT_NO_ERRNO_AND_VALUE(
         Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
@@ -555,7 +558,11 @@ TEST_P(SocketInetLoopbackTest, TCPListenShutdownWhileConnect) {
   });
 }
 
-TEST_P(SocketInetLoopbackTest, TCPbacklog) {
+// TODO(b/157236388): Remove _NoRandomSave once bug is fixed. Test fails w/
+// random save as established connections which can't be delivered to the accept
+// queue because the queue is full are not correctly delivered after restore
+// causing the last accept to timeout on the restore.
+TEST_P(SocketInetLoopbackTest, TCPbacklog_NoRandomSave) {
   auto const& param = GetParam();
 
   TestAddress const& listener = param.listener;
@@ -568,7 +575,8 @@ TEST_P(SocketInetLoopbackTest, TCPbacklog) {
   ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
                    listener.addr_len),
               SyscallSucceeds());
-  ASSERT_THAT(listen(listen_fd.get(), 2), SyscallSucceeds());
+  constexpr int kBacklogSize = 2;
+  ASSERT_THAT(listen(listen_fd.get(), kBacklogSize), SyscallSucceeds());
 
   // Get the port bound by the listening socket.
   socklen_t addrlen = listener.addr_len;
@@ -861,36 +869,38 @@ TEST_P(SocketInetLoopbackTest, TCPResetAfterClose) {
               SyscallSucceedsWithValue(0));
 }
 
-// This test is disabled under random save as the the restore run
-// results in the stack.Seed() being different which can cause
-// sequence number of final connect to be one that is considered
-// old and can cause the test to be flaky.
-TEST_P(SocketInetLoopbackTest, TCPPassiveCloseNoTimeWaitTest_NoRandomSave) {
-  auto const& param = GetParam();
-  TestAddress const& listener = param.listener;
-  TestAddress const& connector = param.connector;
-
+// setupTimeWaitClose sets up a socket endpoint in TIME_WAIT state.
+// Callers can choose to perform active close on either ends of the connection
+// and also specify if they want to enabled SO_REUSEADDR.
+void setupTimeWaitClose(const TestAddress* listener,
+                        const TestAddress* connector, bool reuse,
+                        bool accept_close, sockaddr_storage* listen_addr,
+                        sockaddr_storage* conn_bound_addr) {
   // Create the listening socket.
-  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
-      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
-  sockaddr_storage listen_addr = listener.addr;
-  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
-                   listener.addr_len),
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener->family(), SOCK_STREAM, IPPROTO_TCP));
+  if (reuse) {
+    ASSERT_THAT(setsockopt(listen_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                           &kSockOptOn, sizeof(kSockOptOn)),
+                SyscallSucceeds());
+  }
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(listen_addr),
+                   listener->addr_len),
               SyscallSucceeds());
   ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
 
   // Get the port bound by the listening socket.
-  socklen_t addrlen = listener.addr_len;
+  socklen_t addrlen = listener->addr_len;
   ASSERT_THAT(getsockname(listen_fd.get(),
-                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+                          reinterpret_cast<sockaddr*>(listen_addr), &addrlen),
               SyscallSucceeds());
 
   uint16_t const port =
-      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener->family(), *listen_addr));
 
   // Connect to the listening socket.
   FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
-      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+      Socket(connector->family(), SOCK_STREAM, IPPROTO_TCP));
 
   // We disable saves after this point as a S/R causes the netstack seed
   // to be regenerated which changes what ports/ISN is picked for a given
@@ -901,11 +911,12 @@ TEST_P(SocketInetLoopbackTest, TCPPassiveCloseNoTimeWaitTest_NoRandomSave) {
   //
   // TODO(gvisor.dev/issue/940): S/R portSeed/portHint
   DisableSave ds;
-  sockaddr_storage conn_addr = connector.addr;
-  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+
+  sockaddr_storage conn_addr = connector->addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector->family(), &conn_addr, port));
   ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
                                   reinterpret_cast<sockaddr*>(&conn_addr),
-                                  connector.addr_len),
+                                  connector->addr_len),
               SyscallSucceeds());
 
   // Accept the connection.
@@ -913,50 +924,150 @@ TEST_P(SocketInetLoopbackTest, TCPPassiveCloseNoTimeWaitTest_NoRandomSave) {
       ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
 
   // Get the address/port bound by the connecting socket.
-  sockaddr_storage conn_bound_addr;
-  socklen_t conn_addrlen = connector.addr_len;
+  socklen_t conn_addrlen = connector->addr_len;
   ASSERT_THAT(
-      getsockname(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+      getsockname(conn_fd.get(), reinterpret_cast<sockaddr*>(conn_bound_addr),
                   &conn_addrlen),
       SyscallSucceeds());
 
-  // shutdown the accept FD to trigger TIME_WAIT on the accepted socket which
-  // should cause the conn_fd to follow CLOSE_WAIT->LAST_ACK->CLOSED instead of
-  // TIME_WAIT.
-  ASSERT_THAT(shutdown(accepted.get(), SHUT_RDWR), SyscallSucceeds());
+  FileDescriptor active_closefd, passive_closefd;
+  if (accept_close) {
+    active_closefd = std::move(accepted);
+    passive_closefd = std::move(conn_fd);
+  } else {
+    active_closefd = std::move(conn_fd);
+    passive_closefd = std::move(accepted);
+  }
+
+  // shutdown to trigger TIME_WAIT.
+  ASSERT_THAT(shutdown(active_closefd.get(), SHUT_RDWR), SyscallSucceeds());
   {
     const int kTimeout = 10000;
     struct pollfd pfd = {
-        .fd = conn_fd.get(),
+        .fd = passive_closefd.get(),
         .events = POLLIN,
     };
     ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
     ASSERT_EQ(pfd.revents, POLLIN);
   }
+  ScopedThread t([&]() {
+    constexpr int kTimeout = 10000;
+    constexpr int16_t want_events = POLLHUP;
+    struct pollfd pfd = {
+        .fd = active_closefd.get(),
+        .events = want_events,
+    };
+    ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+  });
 
-  conn_fd.reset();
-  // This sleep is required to give conn_fd time to transition to TIME-WAIT.
+  passive_closefd.reset();
+  t.Join();
+  active_closefd.reset();
+  // This sleep is needed to reduce flake to ensure that the passive-close
+  // ensures the state transitions to CLOSE from LAST_ACK.
   absl::SleepFor(absl::Seconds(1));
+}
 
-  // At this point conn_fd should be the one that moved to CLOSE_WAIT and
-  // eventually to CLOSED.
+// These tests are disabled under random save as the the restore run
+// results in the stack.Seed() being different which can cause
+// sequence number of final connect to be one that is considered
+// old and can cause the test to be flaky.
+//
+// Test re-binding of client and server bound addresses when the older
+// connection is in TIME_WAIT.
+TEST_P(SocketInetLoopbackTest, TCPPassiveCloseNoTimeWaitTest_NoRandomSave) {
+  auto const& param = GetParam();
+  sockaddr_storage listen_addr, conn_bound_addr;
+  listen_addr = param.listener.addr;
+  setupTimeWaitClose(&param.listener, &param.connector, false /*reuse*/,
+                     true /*accept_close*/, &listen_addr, &conn_bound_addr);
 
-  // Now bind and connect a new socket and verify that we can immediately
-  // rebind the address bound by the conn_fd as it never entered TIME_WAIT.
-  const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE(
-      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+  // Now bind a new socket and verify that we can immediately rebind the address
+  // bound by the conn_fd as it never entered TIME_WAIT.
+  const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(param.connector.family(), SOCK_STREAM, IPPROTO_TCP));
+  ASSERT_THAT(bind(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+                   param.connector.addr_len),
+              SyscallSucceeds());
 
-  ASSERT_THAT(bind(conn_fd2.get(),
-                   reinterpret_cast<sockaddr*>(&conn_bound_addr), conn_addrlen),
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(param.listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   param.listener.addr_len),
+              SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(SocketInetLoopbackTest,
+       TCPPassiveCloseNoTimeWaitReuseTest_NoRandomSave) {
+  auto const& param = GetParam();
+  sockaddr_storage listen_addr, conn_bound_addr;
+  listen_addr = param.listener.addr;
+  setupTimeWaitClose(&param.listener, &param.connector, true /*reuse*/,
+                     true /*accept_close*/, &listen_addr, &conn_bound_addr);
+
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(param.listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  ASSERT_THAT(setsockopt(listen_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
               SyscallSucceeds());
-  ASSERT_THAT(RetryEINTR(connect)(conn_fd2.get(),
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   param.listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Now bind and connect  new socket and verify that we can immediately rebind
+  // the address bound by the conn_fd as it never entered TIME_WAIT.
+  const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(param.connector.family(), SOCK_STREAM, IPPROTO_TCP));
+  ASSERT_THAT(setsockopt(conn_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+                   param.connector.addr_len),
+              SyscallSucceeds());
+
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(param.listener.family(), listen_addr));
+  sockaddr_storage conn_addr = param.connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(param.connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
                                   reinterpret_cast<sockaddr*>(&conn_addr),
-                                  conn_addrlen),
+                                  param.connector.addr_len),
               SyscallSucceeds());
 }
 
 TEST_P(SocketInetLoopbackTest, TCPActiveCloseTimeWaitTest_NoRandomSave) {
   auto const& param = GetParam();
+  sockaddr_storage listen_addr, conn_bound_addr;
+  listen_addr = param.listener.addr;
+  setupTimeWaitClose(&param.listener, &param.connector, false /*reuse*/,
+                     false /*accept_close*/, &listen_addr, &conn_bound_addr);
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(param.connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  ASSERT_THAT(bind(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+                   param.connector.addr_len),
+              SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(SocketInetLoopbackTest, TCPActiveCloseTimeWaitReuseTest_NoRandomSave) {
+  auto const& param = GetParam();
+  sockaddr_storage listen_addr, conn_bound_addr;
+  listen_addr = param.listener.addr;
+  setupTimeWaitClose(&param.listener, &param.connector, true /*reuse*/,
+                     false /*accept_close*/, &listen_addr, &conn_bound_addr);
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(param.connector.family(), SOCK_STREAM, IPPROTO_TCP));
+  ASSERT_THAT(setsockopt(conn_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+                   param.connector.addr_len),
+              SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
+  auto const& param = GetParam();
   TestAddress const& listener = param.listener;
   TestAddress const& connector = param.connector;
 
@@ -975,23 +1086,19 @@ TEST_P(SocketInetLoopbackTest, TCPActiveCloseTimeWaitTest_NoRandomSave) {
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
 
-  uint16_t const port =
+  const uint16_t port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
+  // Set the userTimeout on the listening socket.
+  constexpr int kUserTimeout = 10;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kUserTimeout, sizeof(kUserTimeout)),
+              SyscallSucceeds());
+
   // Connect to the listening socket.
   FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
       Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
 
-  // We disable saves after this point as a S/R causes the netstack seed
-  // to be regenerated which changes what ports/ISN is picked for a given
-  // tuple (src ip,src port, dst ip, dst port). This can cause the final
-  // SYN to use a sequence number that looks like one from the current
-  // connection in TIME_WAIT and will not be accepted causing the test
-  // to timeout.
-  //
-  // TODO(gvisor.dev/issue/940): S/R portSeed/portHint
-  DisableSave ds;
-
   sockaddr_storage conn_addr = connector.addr;
   ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
   ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
@@ -1002,51 +1109,18 @@ TEST_P(SocketInetLoopbackTest, TCPActiveCloseTimeWaitTest_NoRandomSave) {
   // Accept the connection.
   auto accepted =
       ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
-
-  // Get the address/port bound by the connecting socket.
-  sockaddr_storage conn_bound_addr;
-  socklen_t conn_addrlen = connector.addr_len;
+  // Verify that the accepted socket inherited the user timeout set on
+  // listening socket.
+  int get = -1;
+  socklen_t get_len = sizeof(get);
   ASSERT_THAT(
-      getsockname(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
-                  &conn_addrlen),
+      getsockopt(accepted.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
       SyscallSucceeds());
-
-  // shutdown the conn FD to trigger TIME_WAIT on the connect socket.
-  ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds());
-  {
-    const int kTimeout = 10000;
-    struct pollfd pfd = {
-        .fd = accepted.get(),
-        .events = POLLIN,
-    };
-    ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
-    ASSERT_EQ(pfd.revents, POLLIN);
-  }
-  ScopedThread t([&]() {
-    constexpr int kTimeout = 10000;
-    constexpr int16_t want_events = POLLHUP;
-    struct pollfd pfd = {
-        .fd = conn_fd.get(),
-        .events = want_events,
-    };
-    ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
-  });
-
-  accepted.reset();
-  t.Join();
-  conn_fd.reset();
-
-  // Now bind and connect a new socket and verify that we can't immediately
-  // rebind the address bound by the conn_fd as it is in TIME_WAIT.
-  conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
-      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
-
-  ASSERT_THAT(bind(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
-                   conn_addrlen),
-              SyscallFailsWithErrno(EADDRINUSE));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kUserTimeout);
 }
 
-TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
+TEST_P(SocketInetLoopbackTest, TCPAcceptAfterReset) {
   auto const& param = GetParam();
   TestAddress const& listener = param.listener;
   TestAddress const& connector = param.connector;
@@ -1061,43 +1135,72 @@ TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
   ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
 
   // Get the port bound by the listening socket.
-  socklen_t addrlen = listener.addr_len;
-  ASSERT_THAT(getsockname(listen_fd.get(),
-                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
-              SyscallSucceeds());
+  {
+    socklen_t addrlen = listener.addr_len;
+    ASSERT_THAT(
+        getsockname(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                    &addrlen),
+        SyscallSucceeds());
+  }
 
   const uint16_t port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
-  // Set the userTimeout on the listening socket.
-  constexpr int kUserTimeout = 10;
-  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
-                         &kUserTimeout, sizeof(kUserTimeout)),
-              SyscallSucceeds());
-
   // Connect to the listening socket.
   FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
       Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
 
   sockaddr_storage conn_addr = connector.addr;
   ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+
+  // TODO(b/157236388): Reenable Cooperative S/R once bug is fixed.
+  DisableSave ds;
   ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
                                   reinterpret_cast<sockaddr*>(&conn_addr),
                                   connector.addr_len),
               SyscallSucceeds());
 
-  // Accept the connection.
-  auto accepted =
-      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
-  // Verify that the accepted socket inherited the user timeout set on
-  // listening socket.
-  int get = -1;
-  socklen_t get_len = sizeof(get);
+  // Trigger a RST by turning linger off and closing the socket.
+  struct linger opt = {
+      .l_onoff = 1,
+      .l_linger = 0,
+  };
   ASSERT_THAT(
-      getsockopt(accepted.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      setsockopt(conn_fd.get(), SOL_SOCKET, SO_LINGER, &opt, sizeof(opt)),
       SyscallSucceeds());
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kUserTimeout);
+  ASSERT_THAT(close(conn_fd.release()), SyscallSucceeds());
+
+  if (IsRunningOnGvisor()) {
+    // Gvisor packet procssing is asynchronous and can take a bit of time in
+    // some cases so we give it a bit of time to process the RST packet before
+    // calling accept.
+    //
+    // There is nothing to poll() on so we have no choice but to use a sleep
+    // here.
+    absl::SleepFor(absl::Milliseconds(100));
+  }
+
+  sockaddr_storage accept_addr;
+  socklen_t addrlen = sizeof(accept_addr);
+
+  auto accept_fd = ASSERT_NO_ERRNO_AND_VALUE(Accept(
+      listen_fd.get(), reinterpret_cast<sockaddr*>(&accept_addr), &addrlen));
+  ASSERT_EQ(addrlen, listener.addr_len);
+
+  // TODO(gvisor.dev/issue/3812): Remove after SO_ERROR is fixed.
+  if (IsRunningOnGvisor()) {
+    char buf[10];
+    ASSERT_THAT(ReadFd(accept_fd.get(), buf, sizeof(buf)),
+                SyscallFailsWithErrno(ECONNRESET));
+  } else {
+    int err;
+    socklen_t optlen = sizeof(err);
+    ASSERT_THAT(
+        getsockopt(accept_fd.get(), SOL_SOCKET, SO_ERROR, &err, &optlen),
+        SyscallSucceeds());
+    ASSERT_EQ(err, ECONNRESET);
+    ASSERT_EQ(optlen, sizeof(err));
+  }
 }
 
 // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
@@ -2573,6 +2676,44 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReservedReuseAddr) {
       SyscallSucceeds());
 }
 
+TEST_P(SocketMultiProtocolInetLoopbackTest,
+       MultipleBindsAllowedNoListeningReuseAddr) {
+  const auto& param = GetParam();
+  // UDP sockets are allowed to bind/listen on the port w/ SO_REUSEADDR, for TCP
+  // this is only permitted if there is no other listening socket.
+  SKIP_IF(param.type != SOCK_STREAM);
+  // Bind the v4 loopback on a v4 socket.
+  const TestAddress& test_addr = V4Loopback();
+  sockaddr_storage bound_addr = test_addr.addr;
+  FileDescriptor bound_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+
+  ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                   test_addr.addr_len),
+              SyscallSucceeds());
+  // Get the port that we bound.
+  socklen_t bound_addr_len = test_addr.addr_len;
+  ASSERT_THAT(
+      getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                  &bound_addr_len),
+      SyscallSucceeds());
+
+  // Now create a socket and bind it to the same port, this should
+  // succeed since there is no listening socket for the same port.
+  FileDescriptor second_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+
+  ASSERT_THAT(setsockopt(second_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(second_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                   test_addr.addr_len),
+              SyscallSucceeds());
+}
+
 TEST_P(SocketMultiProtocolInetLoopbackTest, PortReuseTwoSockets) {
   auto const& param = GetParam();
   TestAddress const& test_addr = V4Loopback();
diff --git a/test/syscalls/linux/socket_inet_loopback_nogotsan.cc b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc
index 791e2bd51..1a0b53394 100644
--- a/test/syscalls/linux/socket_inet_loopback_nogotsan.cc
+++ b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc
@@ -168,6 +168,71 @@ INSTANTIATE_TEST_SUITE_P(
         TestParam{V6Loopback(), V6Loopback()}),
     DescribeTestParam);
 
+struct ProtocolTestParam {
+  std::string description;
+  int type;
+};
+
+std::string DescribeProtocolTestParam(
+    ::testing::TestParamInfo<ProtocolTestParam> const& info) {
+  return info.param.description;
+}
+
+using SocketMultiProtocolInetLoopbackTest =
+    ::testing::TestWithParam<ProtocolTestParam>;
+
+TEST_P(SocketMultiProtocolInetLoopbackTest,
+       BindAvoidsListeningPortsReuseAddr_NoRandomSave) {
+  const auto& param = GetParam();
+  // UDP sockets are allowed to bind/listen on the port w/ SO_REUSEADDR, for TCP
+  // this is only permitted if there is no other listening socket.
+  SKIP_IF(param.type != SOCK_STREAM);
+
+  DisableSave ds;  // Too many syscalls.
+
+  // A map of port to file descriptor binding the port.
+  std::map<uint16_t, FileDescriptor> listen_sockets;
+
+  // Exhaust all ephemeral ports.
+  while (true) {
+    // Bind the v4 loopback on a v4 socket.
+    TestAddress const& test_addr = V4Loopback();
+    sockaddr_storage bound_addr = test_addr.addr;
+    FileDescriptor bound_fd =
+        ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+
+    ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                           &kSockOptOn, sizeof(kSockOptOn)),
+                SyscallSucceeds());
+
+    int ret = bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                   test_addr.addr_len);
+    if (ret != 0) {
+      ASSERT_EQ(errno, EADDRINUSE);
+      break;
+    }
+    // Get the port that we bound.
+    socklen_t bound_addr_len = test_addr.addr_len;
+    ASSERT_THAT(
+        getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                    &bound_addr_len),
+        SyscallSucceeds());
+    uint16_t port = reinterpret_cast<sockaddr_in*>(&bound_addr)->sin_port;
+
+    // Newly bound port should not already be in use by a listening socket.
+    ASSERT_EQ(listen_sockets.find(port), listen_sockets.end());
+    auto fd = bound_fd.get();
+    listen_sockets.insert(std::make_pair(port, std::move(bound_fd)));
+    ASSERT_THAT(listen(fd, SOMAXCONN), SyscallSucceeds());
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllFamilies, SocketMultiProtocolInetLoopbackTest,
+    ::testing::Values(ProtocolTestParam{"TCP", SOCK_STREAM},
+                      ProtocolTestParam{"UDP", SOCK_DGRAM}),
+    DescribeProtocolTestParam);
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 53c076787..831d96262 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -14,6 +14,7 @@
 
 #include "test/syscalls/linux/socket_ip_tcp_generic.h"
 
+#include <fcntl.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <poll.h>
@@ -819,18 +820,37 @@ TEST_P(TCPSocketPairTest, TCPLingerTimeoutDefault) {
   EXPECT_EQ(get, kDefaultTCPLingerTimeout);
 }
 
-TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutZeroOrLess) {
+TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutLessThanZero) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
-  constexpr int kZero = 0;
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &kZero,
-                         sizeof(kZero)),
-              SyscallSucceedsWithValue(0));
-
   constexpr int kNegative = -1234;
   EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2,
                          &kNegative, sizeof(kNegative)),
               SyscallSucceedsWithValue(0));
+  int get = INT_MAX;
+  socklen_t get_len = sizeof(get);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, -1);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutZero) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kZero = 0;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &kZero,
+                         sizeof(kZero)),
+              SyscallSucceedsWithValue(0));
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_THAT(get,
+              AnyOf(Eq(kMaxTCPLingerTimeout), Eq(kOldMaxTCPLingerTimeout)));
 }
 
 TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutAboveMax) {
@@ -960,6 +980,56 @@ TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) {
   EXPECT_EQ(get, kAbove);
 }
 
+#ifdef __linux__
+TEST_P(TCPSocketPairTest, SpliceFromPipe) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  FileDescriptor rfd(fds[0]);
+  FileDescriptor wfd(fds[1]);
+
+  // Fill with some random data.
+  std::vector<char> buf(kPageSize / 2);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+
+  EXPECT_THAT(
+      splice(rfd.get(), nullptr, sockets->first_fd(), nullptr, kPageSize, 0),
+      SyscallSucceedsWithValue(buf.size()));
+
+  std::vector<char> rbuf(buf.size());
+  ASSERT_THAT(read(sockets->second_fd(), rbuf.data(), rbuf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+  EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0);
+}
+
+TEST_P(TCPSocketPairTest, SpliceToPipe) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  FileDescriptor rfd(fds[0]);
+  FileDescriptor wfd(fds[1]);
+
+  // Fill with some random data.
+  std::vector<char> buf(kPageSize / 2);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(sockets->first_fd(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+  shutdown(sockets->first_fd(), SHUT_WR);
+  EXPECT_THAT(
+      splice(sockets->second_fd(), nullptr, wfd.get(), nullptr, kPageSize, 0),
+      SyscallSucceedsWithValue(buf.size()));
+
+  std::vector<char> rbuf(buf.size());
+  ASSERT_THAT(read(rfd.get(), rbuf.data(), rbuf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+  EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0);
+}
+#endif  // __linux__
+
 TEST_P(TCPSocketPairTest, SetTCPWindowClampBelowMinRcvBufConnectedSocket) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
   // Discover minimum receive buf by setting a really low value
@@ -1061,5 +1131,124 @@ TEST_P(TCPSocketPairTest, TCPResetDuringClose_NoRandomSave) {
   }
 }
 
+// Test setsockopt and getsockopt for a socket with SO_LINGER option.
+TEST_P(TCPSocketPairTest, SetAndGetLingerOption) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // Check getsockopt before SO_LINGER option is set.
+  struct linger got_linger = {-1, -1};
+  socklen_t got_len = sizeof(got_linger);
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+                         &got_linger, &got_len),
+              SyscallSucceeds());
+  ASSERT_THAT(got_len, sizeof(got_linger));
+  struct linger want_linger = {};
+  EXPECT_EQ(0, memcmp(&want_linger, &got_linger, got_len));
+
+  // Set and get SO_LINGER with negative values.
+  struct linger sl;
+  sl.l_onoff = 1;
+  sl.l_linger = -3;
+  ASSERT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+      SyscallSucceeds());
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+                         &got_linger, &got_len),
+              SyscallSucceeds());
+  ASSERT_EQ(got_len, sizeof(got_linger));
+  EXPECT_EQ(sl.l_onoff, got_linger.l_onoff);
+  // Linux returns a different value as it uses HZ to convert the seconds to
+  // jiffies which overflows for negative values. We want to be compatible with
+  // linux for getsockopt return value.
+  if (IsRunningOnGvisor()) {
+    EXPECT_EQ(sl.l_linger, got_linger.l_linger);
+  }
+
+  // Set and get SO_LINGER option with positive values.
+  sl.l_onoff = 1;
+  sl.l_linger = 5;
+  ASSERT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+      SyscallSucceeds());
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+                         &got_linger, &got_len),
+              SyscallSucceeds());
+  ASSERT_EQ(got_len, sizeof(got_linger));
+  EXPECT_EQ(0, memcmp(&sl, &got_linger, got_len));
+}
+
+// Test socket to disable SO_LINGER option.
+TEST_P(TCPSocketPairTest, SetOffLingerOption) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // Set the SO_LINGER option.
+  struct linger sl;
+  sl.l_onoff = 1;
+  sl.l_linger = 5;
+  ASSERT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+      SyscallSucceeds());
+
+  // Check getsockopt after SO_LINGER option is set.
+  struct linger got_linger = {-1, -1};
+  socklen_t got_len = sizeof(got_linger);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+                         &got_linger, &got_len),
+              SyscallSucceeds());
+  ASSERT_EQ(got_len, sizeof(got_linger));
+  EXPECT_EQ(0, memcmp(&sl, &got_linger, got_len));
+
+  sl.l_onoff = 0;
+  sl.l_linger = 5;
+  ASSERT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+      SyscallSucceeds());
+
+  // Check getsockopt after SO_LINGER option is set to zero.
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+                         &got_linger, &got_len),
+              SyscallSucceeds());
+  ASSERT_EQ(got_len, sizeof(got_linger));
+  EXPECT_EQ(0, memcmp(&sl, &got_linger, got_len));
+}
+
+// Test close on dup'd socket with SO_LINGER option set.
+TEST_P(TCPSocketPairTest, CloseWithLingerOption) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // Set the SO_LINGER option.
+  struct linger sl;
+  sl.l_onoff = 1;
+  sl.l_linger = 5;
+  ASSERT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+      SyscallSucceeds());
+
+  // Check getsockopt after SO_LINGER option is set.
+  struct linger got_linger = {-1, -1};
+  socklen_t got_len = sizeof(got_linger);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+                         &got_linger, &got_len),
+              SyscallSucceeds());
+  ASSERT_EQ(got_len, sizeof(got_linger));
+  EXPECT_EQ(0, memcmp(&sl, &got_linger, got_len));
+
+  FileDescriptor dupFd = FileDescriptor(dup(sockets->first_fd()));
+  ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+  char buf[10] = {};
+  // Write on dupFd should succeed as socket will not be closed until
+  // all references are removed.
+  ASSERT_THAT(RetryEINTR(write)(dupFd.get(), buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+  ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
+              SyscallFailsWithErrno(EBADF));
+
+  // Close the socket.
+  dupFd.reset();
+  // Write on dupFd should fail as all references for socket are removed.
+  ASSERT_THAT(RetryEINTR(write)(dupFd.get(), buf, sizeof(buf)),
+              SyscallFailsWithErrno(EBADF));
+}
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index edb86aded..f69f8f99f 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -435,8 +435,10 @@ TEST_P(UDPSocketPairTest, TOSRecvMismatch) {
 
 // Test that an IPv4 socket does not support the IPv6 TClass option.
 TEST_P(UDPSocketPairTest, TClassRecvMismatch) {
-  // This should only test AF_INET sockets for the mismatch behavior.
-  SKIP_IF(GetParam().domain != AF_INET);
+  // This should only test AF_INET6 sockets for the mismatch behavior.
+  SKIP_IF(GetParam().domain != AF_INET6);
+  // IPV6_RECVTCLASS is only valid for SOCK_DGRAM and SOCK_RAW.
+  SKIP_IF(GetParam().type != SOCK_DGRAM | GetParam().type != SOCK_RAW);
 
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -448,5 +450,41 @@ TEST_P(UDPSocketPairTest, TClassRecvMismatch) {
               SyscallFailsWithErrno(EOPNOTSUPP));
 }
 
+// Test the SO_LINGER option can be set/get on udp socket.
+TEST_P(UDPSocketPairTest, SetAndGetSocketLinger) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  int level = SOL_SOCKET;
+  int type = SO_LINGER;
+
+  struct linger sl;
+  sl.l_onoff = 1;
+  sl.l_linger = 5;
+  ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &sl, sizeof(sl)),
+              SyscallSucceedsWithValue(0));
+
+  struct linger got_linger = {};
+  socklen_t length = sizeof(sl);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), level, type, &got_linger, &length),
+      SyscallSucceedsWithValue(0));
+
+  ASSERT_EQ(length, sizeof(got_linger));
+  EXPECT_EQ(0, memcmp(&sl, &got_linger, length));
+}
+
+// Test getsockopt for SO_ACCEPTCONN on udp socket.
+TEST_P(UDPSocketPairTest, GetSocketAcceptConn) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int got = -1;
+  socklen_t length = sizeof(got);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_ACCEPTCONN, &got, &length),
+      SyscallSucceedsWithValue(0));
+
+  ASSERT_EQ(length, sizeof(got));
+  EXPECT_EQ(got, 0);
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index bc005e2bb..b3f54e7f6 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -27,6 +27,8 @@
 #include "absl/memory/memory.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/save_util.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
@@ -73,9 +75,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNoGroup) {
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
-                               MSG_DONTWAIT),
-              SyscallFailsWithErrno(EAGAIN));
+  EXPECT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      PosixErrorIs(EAGAIN, ::testing::_));
 }
 
 // Check that not setting a default send interface prevents multicast packets
@@ -207,8 +209,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackAddr) {
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -262,8 +265,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNic) {
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -317,8 +321,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddr) {
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -372,8 +377,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNic) {
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -431,8 +437,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrConnect) {
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -490,8 +497,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicConnect) {
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -545,8 +553,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrSelf) {
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket1->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -600,8 +609,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelf) {
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket1->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -659,9 +669,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrSelfConnect) {
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf),
-                               MSG_DONTWAIT),
-              SyscallFailsWithErrno(EAGAIN));
+  EXPECT_THAT(
+      RecvTimeout(socket1->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      PosixErrorIs(EAGAIN, ::testing::_));
 }
 
 // Check that multicast works when the default send interface is configured by
@@ -717,9 +727,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelfConnect) {
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf),
-                               MSG_DONTWAIT),
-              SyscallFailsWithErrno(EAGAIN));
+  EXPECT_THAT(
+      RecvTimeout(socket1->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      PosixErrorIs(EAGAIN, ::testing::_));
 }
 
 // Check that multicast works when the default send interface is configured by
@@ -775,8 +785,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrSelfNoLoop) {
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket1->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -834,8 +845,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelfNoLoop) {
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket1->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -907,9 +919,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastDropAddr) {
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
-                               MSG_DONTWAIT),
-              SyscallFailsWithErrno(EAGAIN));
+  EXPECT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      PosixErrorIs(EAGAIN, ::testing::_));
 }
 
 // Check that dropping a group membership prevents multicast packets from being
@@ -965,9 +977,9 @@ TEST_P(IPv4UDPUnboundSocketTest, IpMulticastDropNic) {
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
-                               MSG_DONTWAIT),
-              SyscallFailsWithErrno(EAGAIN));
+  EXPECT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      PosixErrorIs(EAGAIN, ::testing::_));
 }
 
 TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfZero) {
@@ -1319,9 +1331,9 @@ TEST_P(IPv4UDPUnboundSocketTest, TestMcastReceptionOnTwoSockets) {
     // Check that we received the multicast packet on both sockets.
     for (auto& sockets : socket_pairs) {
       char recv_buf[sizeof(send_buf)] = {};
-      ASSERT_THAT(
-          RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-          SyscallSucceedsWithValue(sizeof(recv_buf)));
+      ASSERT_THAT(RecvTimeout(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+                              1 /*timeout*/),
+                  IsPosixErrorOkAndHolds(sizeof(recv_buf)));
       EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
     }
   }
@@ -1398,9 +1410,9 @@ TEST_P(IPv4UDPUnboundSocketTest, TestMcastReceptionWhenDroppingMemberships) {
     // Check that we received the multicast packet on both sockets.
     for (auto& sockets : socket_pairs) {
       char recv_buf[sizeof(send_buf)] = {};
-      ASSERT_THAT(
-          RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-          SyscallSucceedsWithValue(sizeof(recv_buf)));
+      ASSERT_THAT(RecvTimeout(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+                              1 /*timeout*/),
+                  IsPosixErrorOkAndHolds(sizeof(recv_buf)));
       EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
     }
   }
@@ -1421,9 +1433,9 @@ TEST_P(IPv4UDPUnboundSocketTest, TestMcastReceptionWhenDroppingMemberships) {
 
     char recv_buf[sizeof(send_buf)] = {};
     for (auto& sockets : socket_pairs) {
-      ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf,
-                                   sizeof(recv_buf), MSG_DONTWAIT),
-                  SyscallFailsWithErrno(EAGAIN));
+      ASSERT_THAT(RecvTimeout(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+                              1 /*timeout*/),
+                  PosixErrorIs(EAGAIN, ::testing::_));
     }
   }
 }
@@ -1474,9 +1486,9 @@ TEST_P(IPv4UDPUnboundSocketTest, TestBindToMcastThenJoinThenReceive) {
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
-                               MSG_DONTWAIT),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
 
@@ -1518,9 +1530,9 @@ TEST_P(IPv4UDPUnboundSocketTest, TestBindToMcastThenNoJoinThenNoReceive) {
 
   // Check that we don't receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
-                               MSG_DONTWAIT),
-              SyscallFailsWithErrno(EAGAIN));
+  ASSERT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      PosixErrorIs(EAGAIN, ::testing::_));
 }
 
 // Check that a socket can bind to a multicast address and still send out
@@ -1568,9 +1580,9 @@ TEST_P(IPv4UDPUnboundSocketTest, TestBindToMcastThenSend) {
 
   // Check that we received the packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
-                               MSG_DONTWAIT),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
 
@@ -1615,9 +1627,9 @@ TEST_P(IPv4UDPUnboundSocketTest, TestBindToBcastThenReceive) {
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
-                               MSG_DONTWAIT),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
 
@@ -1666,9 +1678,9 @@ TEST_P(IPv4UDPUnboundSocketTest, TestBindToBcastThenSend) {
 
   // Check that we received the packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
-                               MSG_DONTWAIT),
-              SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(
+      RecvTimeout(socket2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(recv_buf)));
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
 
@@ -1726,17 +1738,17 @@ TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrDistribution_NoRandomSave) {
     // of the other sockets to have received it, but we will check that later.
     char recv_buf[sizeof(send_buf)] = {};
     EXPECT_THAT(
-        RetryEINTR(recv)(last->get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT),
-        SyscallSucceedsWithValue(sizeof(send_buf)));
+        RecvTimeout(last->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+        IsPosixErrorOkAndHolds(sizeof(send_buf)));
     EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
   }
 
   // Verify that no other messages were received.
   for (auto& socket : sockets) {
     char recv_buf[kMessageSize] = {};
-    EXPECT_THAT(RetryEINTR(recv)(socket->get(), recv_buf, sizeof(recv_buf),
-                                 MSG_DONTWAIT),
-                SyscallFailsWithErrno(EAGAIN));
+    EXPECT_THAT(
+        RecvTimeout(socket->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+        PosixErrorIs(EAGAIN, ::testing::_));
   }
 }
 
@@ -2097,6 +2109,9 @@ TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
 
   constexpr int kMessageSize = 10;
 
+  // Saving during each iteration of the following loop is too expensive.
+  DisableSave ds;
+
   for (int i = 0; i < 100; ++i) {
     // Send a new message to the REUSEADDR/REUSEPORT group. We use a new socket
     // each time so that a new ephemerial port will be used each time. This
@@ -2109,49 +2124,18 @@ TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
                 SyscallSucceedsWithValue(sizeof(send_buf)));
   }
 
+  ds.reset();
+
   // Check that both receivers got messages. This checks that we are using load
   // balancing (REUSEPORT) instead of the most recently bound socket
   // (REUSEADDR).
   char recv_buf[kMessageSize] = {};
-  EXPECT_THAT(RetryEINTR(recv)(receiver1->get(), recv_buf, sizeof(recv_buf),
-                               MSG_DONTWAIT),
-              SyscallSucceedsWithValue(kMessageSize));
-  EXPECT_THAT(RetryEINTR(recv)(receiver2->get(), recv_buf, sizeof(recv_buf),
-                               MSG_DONTWAIT),
-              SyscallSucceedsWithValue(kMessageSize));
-}
-
-// Check that connect returns EADDRNOTAVAIL when out of local ephemeral ports.
-// We disable S/R because this test creates a large number of sockets.
-TEST_P(IPv4UDPUnboundSocketTest, UDPConnectPortExhaustion_NoRandomSave) {
-  auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
-  constexpr int kClients = 65536;
-  // Bind the first socket to the loopback and take note of the selected port.
-  auto addr = V4Loopback();
-  ASSERT_THAT(bind(receiver1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
-                   addr.addr_len),
-              SyscallSucceeds());
-  socklen_t addr_len = addr.addr_len;
-  ASSERT_THAT(getsockname(receiver1->get(),
-                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
-              SyscallSucceeds());
-  EXPECT_EQ(addr_len, addr.addr_len);
-
-  // Disable cooperative S/R as we are making too many syscalls.
-  DisableSave ds;
-  std::vector<std::unique_ptr<FileDescriptor>> sockets;
-  for (int i = 0; i < kClients; i++) {
-    auto s = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
-
-    int ret = connect(s->get(), reinterpret_cast<sockaddr*>(&addr.addr),
-                      addr.addr_len);
-    if (ret == 0) {
-      sockets.push_back(std::move(s));
-      continue;
-    }
-    ASSERT_THAT(ret, SyscallFailsWithErrno(EAGAIN));
-    break;
-  }
+  EXPECT_THAT(
+      RecvTimeout(receiver1->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(kMessageSize));
+  EXPECT_THAT(
+      RecvTimeout(receiver2->get(), recv_buf, sizeof(recv_buf), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(kMessageSize));
 }
 
 // Test that socket will receive packet info control message.
@@ -2215,8 +2199,8 @@ TEST_P(IPv4UDPUnboundSocketTest, SetAndReceiveIPPKTINFO) {
   received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
   received_msg.msg_control = received_cmsg_buf;
 
-  ASSERT_THAT(RetryEINTR(recvmsg)(receiver->get(), &received_msg, 0),
-              SyscallSucceedsWithValue(kDataLength));
+  ASSERT_THAT(RecvMsgTimeout(receiver->get(), &received_msg, 1 /*timeout*/),
+              IsPosixErrorOkAndHolds(kDataLength));
 
   cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
   ASSERT_NE(cmsg, nullptr);
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_netlink.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_netlink.cc
new file mode 100644
index 000000000..8052bf404
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_netlink.cc
@@ -0,0 +1,32 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+INSTANTIATE_TEST_SUITE_P(
+    IPv4UDPSockets, IPv4UDPUnboundSocketNetlinkTest,
+    ::testing::ValuesIn(ApplyVec<SocketKind>(IPv4UDPUnboundSocket,
+                                             AllBitwiseCombinations(List<int>{
+                                                 0, SOCK_NONBLOCK}))));
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_nogotsan.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_nogotsan.cc
new file mode 100644
index 000000000..bcbd2feac
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback_nogotsan.cc
@@ -0,0 +1,94 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gtest/gtest.h"
+#include "absl/memory/memory.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to IPv4 UDP sockets.
+using IPv4UDPUnboundSocketNogotsanTest = SimpleSocketTest;
+
+// Check that connect returns EAGAIN when out of local ephemeral ports.
+// We disable S/R because this test creates a large number of sockets.
+TEST_P(IPv4UDPUnboundSocketNogotsanTest,
+       UDPConnectPortExhaustion_NoRandomSave) {
+  auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  constexpr int kClients = 65536;
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(receiver1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(receiver1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Disable cooperative S/R as we are making too many syscalls.
+  DisableSave ds;
+  std::vector<std::unique_ptr<FileDescriptor>> sockets;
+  for (int i = 0; i < kClients; i++) {
+    auto s = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+    int ret = connect(s->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                      addr.addr_len);
+    if (ret == 0) {
+      sockets.push_back(std::move(s));
+      continue;
+    }
+    ASSERT_THAT(ret, SyscallFailsWithErrno(EAGAIN));
+    break;
+  }
+}
+
+// Check that bind returns EADDRINUSE when out of local ephemeral ports.
+// We disable S/R because this test creates a large number of sockets.
+TEST_P(IPv4UDPUnboundSocketNogotsanTest, UDPBindPortExhaustion_NoRandomSave) {
+  auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  constexpr int kClients = 65536;
+  auto addr = V4Loopback();
+  // Disable cooperative S/R as we are making too many syscalls.
+  DisableSave ds;
+  std::vector<std::unique_ptr<FileDescriptor>> sockets;
+  for (int i = 0; i < kClients; i++) {
+    auto s = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+    int ret =
+        bind(s->get(), reinterpret_cast<sockaddr*>(&addr.addr), addr.addr_len);
+    if (ret == 0) {
+      sockets.push_back(std::move(s));
+      continue;
+    }
+    ASSERT_THAT(ret, SyscallFailsWithErrno(EADDRINUSE));
+    break;
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    IPv4UDPSockets, IPv4UDPUnboundSocketNogotsanTest,
+    ::testing::ValuesIn(ApplyVec<SocketKind>(IPv4UDPUnboundSocket,
+                                             AllBitwiseCombinations(List<int>{
+                                                 0, SOCK_NONBLOCK}))));
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.cc
new file mode 100644
index 000000000..875016812
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.cc
@@ -0,0 +1,209 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h"
+
+#include <arpa/inet.h>
+#include <poll.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/cleanup.h"
+
+namespace gvisor {
+namespace testing {
+
+constexpr size_t kSendBufSize = 200;
+
+// Checks that the loopback interface considers itself bound to all IPs in an
+// associated subnet.
+TEST_P(IPv4UDPUnboundSocketNetlinkTest, JoinSubnet) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // Add an IP address to the loopback interface.
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
+  struct in_addr addr;
+  ASSERT_EQ(1, inet_pton(AF_INET, "192.0.2.1", &addr));
+  ASSERT_NO_ERRNO(LinkAddLocalAddr(loopback_link.index, AF_INET,
+                                   /*prefixlen=*/24, &addr, sizeof(addr)));
+  Cleanup defer_addr_removal = Cleanup(
+      [loopback_link = std::move(loopback_link), addr = std::move(addr)] {
+        EXPECT_NO_ERRNO(LinkDelLocalAddr(loopback_link.index, AF_INET,
+                                         /*prefixlen=*/24, &addr,
+                                         sizeof(addr)));
+      });
+
+  auto snd_sock = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto rcv_sock = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Send from an unassigned address but an address that is in the subnet
+  // associated with the loopback interface.
+  TestAddress sender_addr("V4NotAssignd1");
+  sender_addr.addr.ss_family = AF_INET;
+  sender_addr.addr_len = sizeof(sockaddr_in);
+  ASSERT_EQ(1, inet_pton(AF_INET, "192.0.2.2",
+                         &(reinterpret_cast<sockaddr_in*>(&sender_addr.addr)
+                               ->sin_addr.s_addr)));
+  ASSERT_THAT(
+      bind(snd_sock->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+           sender_addr.addr_len),
+      SyscallSucceeds());
+
+  // Send the packet to an unassigned address but an address that is in the
+  // subnet associated with the loopback interface.
+  TestAddress receiver_addr("V4NotAssigned2");
+  receiver_addr.addr.ss_family = AF_INET;
+  receiver_addr.addr_len = sizeof(sockaddr_in);
+  ASSERT_EQ(1, inet_pton(AF_INET, "192.0.2.254",
+                         &(reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)
+                               ->sin_addr.s_addr)));
+  ASSERT_THAT(
+      bind(rcv_sock->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
+  socklen_t receiver_addr_len = receiver_addr.addr_len;
+  ASSERT_THAT(getsockname(rcv_sock->get(),
+                          reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+                          &receiver_addr_len),
+              SyscallSucceeds());
+  ASSERT_EQ(receiver_addr_len, receiver_addr.addr_len);
+  char send_buf[kSendBufSize];
+  RandomizeBuffer(send_buf, kSendBufSize);
+  ASSERT_THAT(
+      RetryEINTR(sendto)(snd_sock->get(), send_buf, kSendBufSize, 0,
+                         reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+                         receiver_addr.addr_len),
+      SyscallSucceedsWithValue(kSendBufSize));
+
+  // Check that we received the packet.
+  char recv_buf[kSendBufSize] = {};
+  ASSERT_THAT(RetryEINTR(recv)(rcv_sock->get(), recv_buf, kSendBufSize, 0),
+              SyscallSucceedsWithValue(kSendBufSize));
+  ASSERT_EQ(0, memcmp(send_buf, recv_buf, kSendBufSize));
+}
+
+// Tests that broadcast packets are delivered to all interested sockets
+// (wildcard and broadcast address specified sockets).
+//
+// Note, we cannot test the IPv4 Broadcast (255.255.255.255) because we do
+// not have a route to it.
+TEST_P(IPv4UDPUnboundSocketNetlinkTest, ReuseAddrSubnetDirectedBroadcast) {
+  constexpr uint16_t kPort = 9876;
+  // Wait up to 20 seconds for the data.
+  constexpr int kPollTimeoutMs = 20000;
+  // Number of sockets per socket type.
+  constexpr int kNumSocketsPerType = 2;
+
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // Add an IP address to the loopback interface.
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
+  struct in_addr addr;
+  ASSERT_EQ(1, inet_pton(AF_INET, "192.0.2.1", &addr));
+  ASSERT_NO_ERRNO(LinkAddLocalAddr(loopback_link.index, AF_INET,
+                                   24 /* prefixlen */, &addr, sizeof(addr)));
+  Cleanup defer_addr_removal = Cleanup(
+      [loopback_link = std::move(loopback_link), addr = std::move(addr)] {
+        EXPECT_NO_ERRNO(LinkDelLocalAddr(loopback_link.index, AF_INET,
+                                         /*prefixlen=*/24, &addr,
+                                         sizeof(addr)));
+      });
+
+  TestAddress broadcast_address("SubnetBroadcastAddress");
+  broadcast_address.addr.ss_family = AF_INET;
+  broadcast_address.addr_len = sizeof(sockaddr_in);
+  auto broadcast_address_in =
+      reinterpret_cast<sockaddr_in*>(&broadcast_address.addr);
+  ASSERT_EQ(1, inet_pton(AF_INET, "192.0.2.255",
+                         &broadcast_address_in->sin_addr.s_addr));
+  broadcast_address_in->sin_port = htons(kPort);
+
+  TestAddress any_address = V4Any();
+  reinterpret_cast<sockaddr_in*>(&any_address.addr)->sin_port = htons(kPort);
+
+  // We create sockets bound to both the wildcard address and the broadcast
+  // address to make sure both of these types of "broadcast interested" sockets
+  // receive broadcast packets.
+  std::vector<std::unique_ptr<FileDescriptor>> socks;
+  for (bool bind_wildcard : {false, true}) {
+    // Create multiple sockets for each type of "broadcast interested"
+    // socket so we can test that all sockets receive the broadcast packet.
+    for (int i = 0; i < kNumSocketsPerType; i++) {
+      auto sock = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+      auto idx = socks.size();
+
+      ASSERT_THAT(setsockopt(sock->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                             sizeof(kSockOptOn)),
+                  SyscallSucceedsWithValue(0))
+          << "socks[" << idx << "]";
+
+      ASSERT_THAT(setsockopt(sock->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn,
+                             sizeof(kSockOptOn)),
+                  SyscallSucceedsWithValue(0))
+          << "socks[" << idx << "]";
+
+      if (bind_wildcard) {
+        ASSERT_THAT(
+            bind(sock->get(), reinterpret_cast<sockaddr*>(&any_address.addr),
+                 any_address.addr_len),
+            SyscallSucceeds())
+            << "socks[" << idx << "]";
+      } else {
+        ASSERT_THAT(bind(sock->get(),
+                         reinterpret_cast<sockaddr*>(&broadcast_address.addr),
+                         broadcast_address.addr_len),
+                    SyscallSucceeds())
+            << "socks[" << idx << "]";
+      }
+
+      socks.push_back(std::move(sock));
+    }
+  }
+
+  char send_buf[kSendBufSize];
+  RandomizeBuffer(send_buf, kSendBufSize);
+
+  // Broadcasts from each socket should be received by every socket (including
+  // the sending socket).
+  for (int w = 0; w < socks.size(); w++) {
+    auto& w_sock = socks[w];
+    ASSERT_THAT(
+        RetryEINTR(sendto)(w_sock->get(), send_buf, kSendBufSize, 0,
+                           reinterpret_cast<sockaddr*>(&broadcast_address.addr),
+                           broadcast_address.addr_len),
+        SyscallSucceedsWithValue(kSendBufSize))
+        << "write socks[" << w << "]";
+
+    // Check that we received the packet on all sockets.
+    for (int r = 0; r < socks.size(); r++) {
+      auto& r_sock = socks[r];
+
+      struct pollfd poll_fd = {r_sock->get(), POLLIN, 0};
+      EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+                  SyscallSucceedsWithValue(1))
+          << "write socks[" << w << "] & read socks[" << r << "]";
+
+      char recv_buf[kSendBufSize] = {};
+      EXPECT_THAT(RetryEINTR(recv)(r_sock->get(), recv_buf, kSendBufSize, 0),
+                  SyscallSucceedsWithValue(kSendBufSize))
+          << "write socks[" << w << "] & read socks[" << r << "]";
+      EXPECT_EQ(0, memcmp(send_buf, recv_buf, kSendBufSize))
+          << "write socks[" << w << "] & read socks[" << r << "]";
+    }
+  }
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h
new file mode 100644
index 000000000..73e7836d5
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_netlink.h
@@ -0,0 +1,29 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_NETLINK_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_NETLINK_UTIL_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to IPv4 UDP sockets.
+using IPv4UDPUnboundSocketNetlinkTest = SimpleSocketTest;
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_NETLINK_UTIL_H_
diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_loopback_netlink.cc b/test/syscalls/linux/socket_ipv6_udp_unbound_loopback_netlink.cc
new file mode 100644
index 000000000..17021ff82
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv6_udp_unbound_loopback_netlink.cc
@@ -0,0 +1,32 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+INSTANTIATE_TEST_SUITE_P(
+    IPv6UDPSockets, IPv6UDPUnboundSocketNetlinkTest,
+    ::testing::ValuesIn(ApplyVec<SocketKind>(IPv6UDPUnboundSocket,
+                                             AllBitwiseCombinations(List<int>{
+                                                 0, SOCK_NONBLOCK}))));
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.cc b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.cc
new file mode 100644
index 000000000..2ee218231
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.cc
@@ -0,0 +1,53 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h"
+
+#include <arpa/inet.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+#include "test/util/capability_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Checks that the loopback interface does not consider itself bound to all IPs
+// in an associated subnet.
+TEST_P(IPv6UDPUnboundSocketNetlinkTest, JoinSubnet) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // Add an IP address to the loopback interface.
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
+  struct in6_addr addr;
+  EXPECT_EQ(1, inet_pton(AF_INET6, "2001:db8::1", &addr));
+  EXPECT_NO_ERRNO(LinkAddLocalAddr(loopback_link.index, AF_INET6,
+                                   /*prefixlen=*/64, &addr, sizeof(addr)));
+
+  // Binding to an unassigned address but an address that is in the subnet
+  // associated with the loopback interface should fail.
+  TestAddress sender_addr("V6NotAssignd1");
+  sender_addr.addr.ss_family = AF_INET6;
+  sender_addr.addr_len = sizeof(sockaddr_in6);
+  EXPECT_EQ(1, inet_pton(AF_INET6, "2001:db8::2",
+                         reinterpret_cast<sockaddr_in6*>(&sender_addr.addr)
+                             ->sin6_addr.s6_addr));
+  auto sock = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  EXPECT_THAT(bind(sock->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+                   sender_addr.addr_len),
+              SyscallFailsWithErrno(EADDRNOTAVAIL));
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h
new file mode 100644
index 000000000..88098be82
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv6_udp_unbound_netlink.h
@@ -0,0 +1,29 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_NETLINK_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_NETLINK_UTIL_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to IPv6 UDP sockets.
+using IPv6UDPUnboundSocketNetlinkTest = SimpleSocketTest;
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV6_UDP_UNBOUND_NETLINK_UTIL_H_
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index b3fcf8e7c..e83f0d81f 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <arpa/inet.h>
+#include <fcntl.h>
 #include <ifaddrs.h>
 #include <linux/if.h>
 #include <linux/netlink.h>
@@ -335,6 +336,49 @@ TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
   EXPECT_EQ((msg.msg_flags & MSG_TRUNC), MSG_TRUNC);
 }
 
+TEST(NetlinkRouteTest, SpliceFromPipe) {
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  FileDescriptor rfd(fds[0]);
+  FileDescriptor wfd(fds[1]);
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.ifm.ifi_index = loopback_link.index;
+
+  ASSERT_THAT(write(wfd.get(), &req, sizeof(req)),
+              SyscallSucceedsWithValue(sizeof(req)));
+
+  EXPECT_THAT(splice(rfd.get(), nullptr, fd.get(), nullptr, sizeof(req) + 1, 0),
+              SyscallSucceedsWithValue(sizeof(req)));
+  close(wfd.release());
+  EXPECT_THAT(splice(rfd.get(), nullptr, fd.get(), nullptr, sizeof(req) + 1, 0),
+              SyscallSucceedsWithValue(0));
+
+  bool found = false;
+  ASSERT_NO_ERRNO(NetlinkResponse(
+      fd,
+      [&](const struct nlmsghdr* hdr) {
+        CheckLinkMsg(hdr, loopback_link);
+        found = true;
+      },
+      false));
+  EXPECT_TRUE(found) << "Netlink response does not contain any links.";
+}
+
 TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
@@ -467,53 +511,42 @@ TEST(NetlinkRouteTest, LookupAll) {
   ASSERT_GT(count, 0);
 }
 
-TEST(NetlinkRouteTest, AddAddr) {
+TEST(NetlinkRouteTest, AddAndRemoveAddr) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+  // Don't do cooperative save/restore because netstack state is not restored.
+  // TODO(gvisor.dev/issue/4595): enable cooperative save tests.
+  const DisableSave ds;
 
   Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
 
-  FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
-
-  struct request {
-    struct nlmsghdr hdr;
-    struct ifaddrmsg ifa;
-    struct rtattr rtattr;
-    struct in_addr addr;
-    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
-  };
-
-  struct request req = {};
-  req.hdr.nlmsg_type = RTM_NEWADDR;
-  req.hdr.nlmsg_seq = kSeq;
-  req.ifa.ifa_family = AF_INET;
-  req.ifa.ifa_prefixlen = 24;
-  req.ifa.ifa_flags = 0;
-  req.ifa.ifa_scope = 0;
-  req.ifa.ifa_index = loopback_link.index;
-  req.rtattr.rta_type = IFA_LOCAL;
-  req.rtattr.rta_len = RTA_LENGTH(sizeof(req.addr));
-  inet_pton(AF_INET, "10.0.0.1", &req.addr);
-  req.hdr.nlmsg_len =
-      NLMSG_LENGTH(sizeof(req.ifa)) + NLMSG_ALIGN(req.rtattr.rta_len);
+  struct in_addr addr;
+  ASSERT_EQ(inet_pton(AF_INET, "10.0.0.1", &addr), 1);
 
   // Create should succeed, as no such address in kernel.
-  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
-  EXPECT_NO_ERRNO(
-      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+  ASSERT_NO_ERRNO(LinkAddLocalAddr(loopback_link.index, AF_INET,
+                                   /*prefixlen=*/24, &addr, sizeof(addr)));
+
+  Cleanup defer_addr_removal = Cleanup(
+      [loopback_link = std::move(loopback_link), addr = std::move(addr)] {
+        // First delete should succeed, as address exists.
+        EXPECT_NO_ERRNO(LinkDelLocalAddr(loopback_link.index, AF_INET,
+                                         /*prefixlen=*/24, &addr,
+                                         sizeof(addr)));
+
+        // Second delete should fail, as address no longer exists.
+        EXPECT_THAT(LinkDelLocalAddr(loopback_link.index, AF_INET,
+                                     /*prefixlen=*/24, &addr, sizeof(addr)),
+                    PosixErrorIs(EINVAL, ::testing::_));
+      });
 
   // Replace an existing address should succeed.
-  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_REPLACE | NLM_F_ACK;
-  req.hdr.nlmsg_seq++;
-  EXPECT_NO_ERRNO(
-      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+  ASSERT_NO_ERRNO(LinkReplaceLocalAddr(loopback_link.index, AF_INET,
+                                       /*prefixlen=*/24, &addr, sizeof(addr)));
 
   // Create exclusive should fail, as we created the address above.
-  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
-  req.hdr.nlmsg_seq++;
-  EXPECT_THAT(
-      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len),
-      PosixErrorIs(EEXIST, ::testing::_));
+  EXPECT_THAT(LinkAddExclusiveLocalAddr(loopback_link.index, AF_INET,
+                                        /*prefixlen=*/24, &addr, sizeof(addr)),
+              PosixErrorIs(EEXIST, ::testing::_));
 }
 
 // GetRouteDump tests a RTM_GETROUTE + NLM_F_DUMP request.
diff --git a/test/syscalls/linux/socket_netlink_route_util.cc b/test/syscalls/linux/socket_netlink_route_util.cc
index bde1dbb4d..46f749c7c 100644
--- a/test/syscalls/linux/socket_netlink_route_util.cc
+++ b/test/syscalls/linux/socket_netlink_route_util.cc
@@ -26,6 +26,72 @@ namespace {
 
 constexpr uint32_t kSeq = 12345;
 
+// Types of address modifications that may be performed on an interface.
+enum class LinkAddrModification {
+  kAdd,
+  kAddExclusive,
+  kReplace,
+  kDelete,
+};
+
+// Populates |hdr| with appripriate values for the modification type.
+PosixError PopulateNlmsghdr(LinkAddrModification modification,
+                            struct nlmsghdr* hdr) {
+  switch (modification) {
+    case LinkAddrModification::kAdd:
+      hdr->nlmsg_type = RTM_NEWADDR;
+      hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+      return NoError();
+    case LinkAddrModification::kAddExclusive:
+      hdr->nlmsg_type = RTM_NEWADDR;
+      hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_EXCL | NLM_F_ACK;
+      return NoError();
+    case LinkAddrModification::kReplace:
+      hdr->nlmsg_type = RTM_NEWADDR;
+      hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_REPLACE | NLM_F_ACK;
+      return NoError();
+    case LinkAddrModification::kDelete:
+      hdr->nlmsg_type = RTM_DELADDR;
+      hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+      return NoError();
+  }
+
+  return PosixError(EINVAL);
+}
+
+// Adds or removes the specified address from the specified interface.
+PosixError LinkModifyLocalAddr(int index, int family, int prefixlen,
+                               const void* addr, int addrlen,
+                               LinkAddrModification modification) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifaddrmsg ifaddr;
+    char attrbuf[512];
+  };
+
+  struct request req = {};
+  PosixError err = PopulateNlmsghdr(modification, &req.hdr);
+  if (!err.ok()) {
+    return err;
+  }
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifaddr));
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifaddr.ifa_index = index;
+  req.ifaddr.ifa_family = family;
+  req.ifaddr.ifa_prefixlen = prefixlen;
+
+  struct rtattr* rta = reinterpret_cast<struct rtattr*>(
+      reinterpret_cast<int8_t*>(&req) + NLMSG_ALIGN(req.hdr.nlmsg_len));
+  rta->rta_type = IFA_LOCAL;
+  rta->rta_len = RTA_LENGTH(addrlen);
+  req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + RTA_LENGTH(addrlen);
+  memcpy(RTA_DATA(rta), addr, addrlen);
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
 }  // namespace
 
 PosixError DumpLinks(
@@ -84,31 +150,26 @@ PosixErrorOr<Link> LoopbackLink() {
 
 PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
                             const void* addr, int addrlen) {
-  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
-
-  struct request {
-    struct nlmsghdr hdr;
-    struct ifaddrmsg ifaddr;
-    char attrbuf[512];
-  };
+  return LinkModifyLocalAddr(index, family, prefixlen, addr, addrlen,
+                             LinkAddrModification::kAdd);
+}
 
-  struct request req = {};
-  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifaddr));
-  req.hdr.nlmsg_type = RTM_NEWADDR;
-  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
-  req.hdr.nlmsg_seq = kSeq;
-  req.ifaddr.ifa_index = index;
-  req.ifaddr.ifa_family = family;
-  req.ifaddr.ifa_prefixlen = prefixlen;
+PosixError LinkAddExclusiveLocalAddr(int index, int family, int prefixlen,
+                                     const void* addr, int addrlen) {
+  return LinkModifyLocalAddr(index, family, prefixlen, addr, addrlen,
+                             LinkAddrModification::kAddExclusive);
+}
 
-  struct rtattr* rta = reinterpret_cast<struct rtattr*>(
-      reinterpret_cast<int8_t*>(&req) + NLMSG_ALIGN(req.hdr.nlmsg_len));
-  rta->rta_type = IFA_LOCAL;
-  rta->rta_len = RTA_LENGTH(addrlen);
-  req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + RTA_LENGTH(addrlen);
-  memcpy(RTA_DATA(rta), addr, addrlen);
+PosixError LinkReplaceLocalAddr(int index, int family, int prefixlen,
+                                const void* addr, int addrlen) {
+  return LinkModifyLocalAddr(index, family, prefixlen, addr, addrlen,
+                             LinkAddrModification::kReplace);
+}
 
-  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+PosixError LinkDelLocalAddr(int index, int family, int prefixlen,
+                            const void* addr, int addrlen) {
+  return LinkModifyLocalAddr(index, family, prefixlen, addr, addrlen,
+                             LinkAddrModification::kDelete);
 }
 
 PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change) {
diff --git a/test/syscalls/linux/socket_netlink_route_util.h b/test/syscalls/linux/socket_netlink_route_util.h
index 149c4a7f6..eaa91ad79 100644
--- a/test/syscalls/linux/socket_netlink_route_util.h
+++ b/test/syscalls/linux/socket_netlink_route_util.h
@@ -39,10 +39,23 @@ PosixErrorOr<std::vector<Link>> DumpLinks();
 // Returns the loopback link on the system. ENOENT if not found.
 PosixErrorOr<Link> LoopbackLink();
 
-// LinkAddLocalAddr sets IFA_LOCAL attribute on the interface.
+// LinkAddLocalAddr adds a new IFA_LOCAL address to the interface.
 PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
                             const void* addr, int addrlen);
 
+// LinkAddExclusiveLocalAddr adds a new IFA_LOCAL address with NLM_F_EXCL flag
+// to the interface.
+PosixError LinkAddExclusiveLocalAddr(int index, int family, int prefixlen,
+                                     const void* addr, int addrlen);
+
+// LinkReplaceLocalAddr replaces an IFA_LOCAL address on the interface.
+PosixError LinkReplaceLocalAddr(int index, int family, int prefixlen,
+                                const void* addr, int addrlen);
+
+// LinkDelLocalAddr removes IFA_LOCAL attribute on the interface.
+PosixError LinkDelLocalAddr(int index, int family, int prefixlen,
+                            const void* addr, int addrlen);
+
 // LinkChangeFlags changes interface flags. E.g. IFF_UP.
 PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change);
 
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index 952eecfe8..bdebea321 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -67,10 +67,21 @@ PosixError NetlinkRequestResponse(
 
   RETURN_ERROR_IF_SYSCALL_FAIL(RetryEINTR(sendmsg)(fd.get(), &msg, 0));
 
+  return NetlinkResponse(fd, fn, expect_nlmsgerr);
+}
+
+PosixError NetlinkResponse(
+    const FileDescriptor& fd,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn,
+    bool expect_nlmsgerr) {
   constexpr size_t kBufferSize = 4096;
   std::vector<char> buf(kBufferSize);
+  struct iovec iov = {};
   iov.iov_base = buf.data();
   iov.iov_len = buf.size();
+  struct msghdr msg = {};
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
 
   // If NLM_F_MULTI is set, response is a series of messages that ends with a
   // NLMSG_DONE message.
diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index e13ead406..f97276d44 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -41,6 +41,14 @@ PosixError NetlinkRequestResponse(
     const std::function<void(const struct nlmsghdr* hdr)>& fn,
     bool expect_nlmsgerr);
 
+// Call fn on all response netlink messages.
+//
+// To be used on requests with NLM_F_MULTI reponses.
+PosixError NetlinkResponse(
+    const FileDescriptor& fd,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn,
+    bool expect_nlmsgerr);
+
 // Send the passed request and call fn on all response netlink messages.
 //
 // To be used on requests without NLM_F_MULTI reponses.
diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
index 53b678e94..a760581b5 100644
--- a/test/syscalls/linux/socket_test_util.cc
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -753,6 +753,32 @@ PosixErrorOr<int> SendMsg(int sock, msghdr* msg, char buf[], int buf_size) {
   return ret;
 }
 
+PosixErrorOr<int> RecvTimeout(int sock, char buf[], int buf_size, int timeout) {
+  fd_set rfd;
+  struct timeval to = {.tv_sec = timeout, .tv_usec = 0};
+  FD_ZERO(&rfd);
+  FD_SET(sock, &rfd);
+
+  int ret;
+  RETURN_ERROR_IF_SYSCALL_FAIL(ret = select(1, &rfd, NULL, NULL, &to));
+  RETURN_ERROR_IF_SYSCALL_FAIL(
+      ret = RetryEINTR(recv)(sock, buf, buf_size, MSG_DONTWAIT));
+  return ret;
+}
+
+PosixErrorOr<int> RecvMsgTimeout(int sock, struct msghdr* msg, int timeout) {
+  fd_set rfd;
+  struct timeval to = {.tv_sec = timeout, .tv_usec = 0};
+  FD_ZERO(&rfd);
+  FD_SET(sock, &rfd);
+
+  int ret;
+  RETURN_ERROR_IF_SYSCALL_FAIL(ret = select(1, &rfd, NULL, NULL, &to));
+  RETURN_ERROR_IF_SYSCALL_FAIL(
+      ret = RetryEINTR(recvmsg)(sock, msg, MSG_DONTWAIT));
+  return ret;
+}
+
 void RecvNoData(int sock) {
   char data = 0;
   struct iovec iov;
diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h
index 734b48b96..5e205339f 100644
--- a/test/syscalls/linux/socket_test_util.h
+++ b/test/syscalls/linux/socket_test_util.h
@@ -467,6 +467,13 @@ PosixError FreeAvailablePort(int port);
 // SendMsg converts a buffer to an iovec and adds it to msg before sending it.
 PosixErrorOr<int> SendMsg(int sock, msghdr* msg, char buf[], int buf_size);
 
+// RecvTimeout calls select on sock with timeout and then calls recv on sock.
+PosixErrorOr<int> RecvTimeout(int sock, char buf[], int buf_size, int timeout);
+
+// RecvMsgTimeout calls select on sock with timeout and then calls recvmsg on
+// sock.
+PosixErrorOr<int> RecvMsgTimeout(int sock, msghdr* msg, int timeout);
+
 // RecvNoData checks that no data is receivable on sock.
 void RecvNoData(int sock);
 
diff --git a/test/syscalls/linux/socket_unix_stream.cc b/test/syscalls/linux/socket_unix_stream.cc
index 99e77b89e..ad9c4bf37 100644
--- a/test/syscalls/linux/socket_unix_stream.cc
+++ b/test/syscalls/linux/socket_unix_stream.cc
@@ -103,6 +103,37 @@ TEST_P(StreamUnixSocketPairTest, Sendto) {
               SyscallFailsWithErrno(EISCONN));
 }
 
+TEST_P(StreamUnixSocketPairTest, SetAndGetSocketLinger) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  struct linger sl = {1, 5};
+  EXPECT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+      SyscallSucceedsWithValue(0));
+
+  struct linger got_linger = {};
+  socklen_t length = sizeof(sl);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+                         &got_linger, &length),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_EQ(length, sizeof(got_linger));
+  EXPECT_EQ(0, memcmp(&got_linger, &sl, length));
+}
+
+TEST_P(StreamUnixSocketPairTest, GetSocketAcceptConn) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int got = -1;
+  socklen_t length = sizeof(got);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_ACCEPTCONN, &got, &length),
+      SyscallSucceedsWithValue(0));
+
+  ASSERT_EQ(length, sizeof(got));
+  EXPECT_EQ(got, 0);
+}
+
 INSTANTIATE_TEST_SUITE_P(
     AllUnixDomainSockets, StreamUnixSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(VecCat<SocketPairKind>(
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index 08fc4b1b7..a1d2b9b11 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -298,6 +298,23 @@ TEST(SpliceTest, ToPipe) {
   EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0);
 }
 
+TEST(SpliceTest, ToPipeEOF) {
+  // Create and open an empty input file.
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor in_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Splice from the empty file to the pipe.
+  EXPECT_THAT(splice(in_fd.get(), nullptr, wfd.get(), nullptr, 123, 0),
+              SyscallSucceedsWithValue(0));
+}
+
 TEST(SpliceTest, ToPipeOffset) {
   // Open the input file.
   const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
@@ -342,7 +359,7 @@ TEST(SpliceTest, FromPipe) {
   ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
               SyscallSucceedsWithValue(kPageSize));
 
-  // Open the input file.
+  // Open the output file.
   const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
   const FileDescriptor out_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR));
@@ -364,6 +381,40 @@ TEST(SpliceTest, FromPipe) {
   EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0);
 }
 
+TEST(SpliceTest, FromPipeMultiple) {
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  std::string buf = "abcABC123";
+  ASSERT_THAT(write(wfd.get(), buf.c_str(), buf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+
+  // Open the output file.
+  const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor out_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR));
+
+  // Splice from the pipe to the output file over several calls.
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), nullptr, 3, 0),
+              SyscallSucceedsWithValue(3));
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), nullptr, 3, 0),
+              SyscallSucceedsWithValue(3));
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), nullptr, 3, 0),
+              SyscallSucceedsWithValue(3));
+
+  // Reset cursor to zero so that we can check the contents.
+  ASSERT_THAT(lseek(out_fd.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+
+  // Contents should be equal.
+  std::vector<char> rbuf(buf.size());
+  ASSERT_THAT(read(out_fd.get(), rbuf.data(), rbuf.size()),
+              SyscallSucceedsWithValue(rbuf.size()));
+  EXPECT_EQ(memcmp(rbuf.data(), buf.c_str(), buf.size()), 0);
+}
+
 TEST(SpliceTest, FromPipeOffset) {
   // Create a new pipe.
   int fds[2];
@@ -693,6 +744,34 @@ TEST(SpliceTest, FromPipeMaxFileSize) {
   EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0);
 }
 
+TEST(SpliceTest, FromPipeToDevZero) {
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  FileDescriptor wfd(fds[1]);
+
+  // Fill with some random data.
+  std::vector<char> buf(kPageSize);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(kPageSize));
+
+  const FileDescriptor zero =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_WRONLY));
+
+  // Close the write end to prevent blocking below.
+  wfd.reset();
+
+  // Splice to /dev/zero. The first call should empty the pipe, and the return
+  // value should not exceed the number of bytes available for reading.
+  EXPECT_THAT(
+      splice(rfd.get(), nullptr, zero.get(), nullptr, kPageSize + 123, 0),
+      SyscallSucceedsWithValue(kPageSize));
+  EXPECT_THAT(splice(rfd.get(), nullptr, zero.get(), nullptr, 1, 0),
+              SyscallSucceedsWithValue(0));
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index 2503960f3..6e7142a42 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -31,6 +31,7 @@
 #include "test/util/cleanup.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/fs_util.h"
+#include "test/util/save_util.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 
@@ -97,6 +98,11 @@ TEST_F(StatTest, FstatatSymlink) {
 }
 
 TEST_F(StatTest, Nlinks) {
+  // Skip this test if we are testing overlayfs because overlayfs does not
+  // (intentionally) return the correct nlink value for directories.
+  // See fs/overlayfs/inode.c:ovl_getattr().
+  SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(IsOverlayfs(GetAbsoluteTestTmpdir())));
+
   TempPath basedir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
 
   // Directory is initially empty, it should contain 2 links (one from itself,
@@ -323,25 +329,37 @@ TEST_F(StatTest, LeadingDoubleSlash) {
   ASSERT_THAT(lstat(double_slash_path.c_str(), &double_slash_st),
               SyscallSucceeds());
   EXPECT_EQ(st.st_dev, double_slash_st.st_dev);
-  EXPECT_EQ(st.st_ino, double_slash_st.st_ino);
+  // Inode numbers for gofer-accessed files may change across save/restore.
+  if (!IsRunningWithSaveRestore()) {
+    EXPECT_EQ(st.st_ino, double_slash_st.st_ino);
+  }
 }
 
 // Test that a rename doesn't change the underlying file.
 TEST_F(StatTest, StatDoesntChangeAfterRename) {
-  const TempPath old_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath old_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
   const TempPath new_path(NewTempAbsPath());
 
   struct stat st_old = {};
   struct stat st_new = {};
 
-  ASSERT_THAT(stat(old_dir.path().c_str(), &st_old), SyscallSucceeds());
-  ASSERT_THAT(rename(old_dir.path().c_str(), new_path.path().c_str()),
+  ASSERT_THAT(stat(old_file.path().c_str(), &st_old), SyscallSucceeds());
+  ASSERT_THAT(rename(old_file.path().c_str(), new_path.path().c_str()),
               SyscallSucceeds());
   ASSERT_THAT(stat(new_path.path().c_str(), &st_new), SyscallSucceeds());
 
   EXPECT_EQ(st_old.st_nlink, st_new.st_nlink);
   EXPECT_EQ(st_old.st_dev, st_new.st_dev);
-  EXPECT_EQ(st_old.st_ino, st_new.st_ino);
+  // Inode numbers for gofer-accessed files on which no reference is held may
+  // change across save/restore because the information that the gofer client
+  // uses to track file identity (9P QID path) is inconsistent between gofer
+  // processes, which are restarted across save/restore.
+  //
+  // Overlay filesystems may synthesize directory inode numbers on the fly.
+  if (!IsRunningWithSaveRestore() &&
+      !ASSERT_NO_ERRNO_AND_VALUE(IsOverlayfs(GetAbsoluteTestTmpdir()))) {
+    EXPECT_EQ(st_old.st_ino, st_new.st_ino);
+  }
   EXPECT_EQ(st_old.st_mode, st_new.st_mode);
   EXPECT_EQ(st_old.st_uid, st_new.st_uid);
   EXPECT_EQ(st_old.st_gid, st_new.st_gid);
@@ -378,7 +396,9 @@ TEST_F(StatTest, LinkCountsWithRegularFileChild) {
 
 // This test verifies that inodes remain around when there is an open fd
 // after link count hits 0.
-TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoRandomSave) {
+//
+// It is marked NoSave because we don't support saving unlinked files.
+TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoSave) {
   // Setting the enviornment variable GVISOR_GOFER_UNCACHED to any value
   // will prevent this test from running, see the tmpfs lifecycle.
   //
@@ -387,9 +407,6 @@ TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoRandomSave) {
   const char* uncached_gofer = getenv("GVISOR_GOFER_UNCACHED");
   SKIP_IF(uncached_gofer != nullptr);
 
-  // We don't support saving unlinked files.
-  const DisableSave ds;
-
   const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const TempPath child = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       dir.path(), "hello", TempPath::kDefaultFileMode));
@@ -432,6 +449,11 @@ TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoRandomSave) {
 
 // Test link counts with a directory as the child.
 TEST_F(StatTest, LinkCountsWithDirChild) {
+  // Skip this test if we are testing overlayfs because overlayfs does not
+  // (intentionally) return the correct nlink value for directories.
+  // See fs/overlayfs/inode.c:ovl_getattr().
+  SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(IsOverlayfs(GetAbsoluteTestTmpdir())));
+
   const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
 
   // Before a child is added the two links are "." and the link from the parent.
@@ -529,6 +551,26 @@ TEST_F(StatTest, LstatELOOPPath) {
   ASSERT_THAT(lstat(path.c_str(), &s), SyscallFailsWithErrno(ELOOP));
 }
 
+TEST(SimpleStatTest, DifferentFilesHaveDifferentDeviceInodeNumberPairs) {
+  // TODO(gvisor.dev/issue/1624): This test case fails in VFS1 save/restore
+  // tests because VFS1 gofer inode number assignment restarts after
+  // save/restore, such that the inodes for file1 and file2 (which are
+  // unreferenced and therefore not retained in sentry checkpoints before the
+  // calls to lstat()) are assigned the same inode number.
+  SKIP_IF(IsRunningWithVFS1() && IsRunningWithSaveRestore());
+
+  TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  TempPath file2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+  MaybeSave();
+  struct stat st1 = ASSERT_NO_ERRNO_AND_VALUE(Lstat(file1.path()));
+  MaybeSave();
+  struct stat st2 = ASSERT_NO_ERRNO_AND_VALUE(Lstat(file2.path()));
+  EXPECT_FALSE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino)
+      << "both files have device number " << st1.st_dev << " and inode number "
+      << st1.st_ino;
+}
+
 // Ensure that inode allocation for anonymous devices work correctly across
 // save/restore. In particular, inode numbers should be unique across S/R.
 TEST(SimpleStatTest, AnonDeviceAllocatesUniqueInodesAcrossSaveRestore) {
diff --git a/test/syscalls/linux/statfs.cc b/test/syscalls/linux/statfs.cc
index aca51d30f..f0fb166bd 100644
--- a/test/syscalls/linux/statfs.cc
+++ b/test/syscalls/linux/statfs.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <fcntl.h>
+#include <linux/magic.h>
 #include <sys/statfs.h>
 #include <unistd.h>
 
@@ -43,14 +44,10 @@ TEST(StatfsTest, InternalTmpfs) {
 TEST(StatfsTest, InternalDevShm) {
   struct statfs st;
   EXPECT_THAT(statfs("/dev/shm", &st), SyscallSucceeds());
-}
-
-TEST(StatfsTest, NameLen) {
-  struct statfs st;
-  EXPECT_THAT(statfs("/dev/shm", &st), SyscallSucceeds());
 
   // This assumes that /dev/shm is tmpfs.
-  EXPECT_EQ(st.f_namelen, NAME_MAX);
+  // Note: We could be an overlay on some configurations.
+  EXPECT_TRUE(st.f_type == TMPFS_MAGIC || st.f_type == OVERLAYFS_SUPER_MAGIC);
 }
 
 TEST(FstatfsTest, CannotStatBadFd) {
diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc
index a17ff62e9..4d9eba7f0 100644
--- a/test/syscalls/linux/symlink.cc
+++ b/test/syscalls/linux/symlink.cc
@@ -218,6 +218,36 @@ TEST(SymlinkTest, PreadFromSymlink) {
   EXPECT_THAT(unlink(linkname.c_str()), SyscallSucceeds());
 }
 
+TEST(SymlinkTest, PwriteToSymlink) {
+  std::string name = NewTempAbsPath();
+  int fd;
+  ASSERT_THAT(fd = open(name.c_str(), O_CREAT, 0644), SyscallSucceeds());
+  ASSERT_THAT(close(fd), SyscallSucceeds());
+
+  std::string linkname = NewTempAbsPath();
+  ASSERT_THAT(symlink(name.c_str(), linkname.c_str()), SyscallSucceeds());
+
+  ASSERT_THAT(fd = open(linkname.c_str(), O_WRONLY), SyscallSucceeds());
+
+  const int data_size = 10;
+  const std::string data = std::string(data_size, 'a');
+  EXPECT_THAT(pwrite64(fd, data.c_str(), data.size(), 0),
+              SyscallSucceedsWithValue(data.size()));
+
+  ASSERT_THAT(close(fd), SyscallSucceeds());
+  ASSERT_THAT(fd = open(name.c_str(), O_RDONLY), SyscallSucceeds());
+
+  char buf[data_size + 1];
+  EXPECT_THAT(pread64(fd, buf, data.size(), 0), SyscallSucceeds());
+  buf[data.size()] = '\0';
+  EXPECT_STREQ(buf, data.c_str());
+
+  ASSERT_THAT(close(fd), SyscallSucceeds());
+
+  EXPECT_THAT(unlink(name.c_str()), SyscallSucceeds());
+  EXPECT_THAT(unlink(linkname.c_str()), SyscallSucceeds());
+}
+
 TEST(SymlinkTest, SymlinkAtDegradedPermissions_NoRandomSave) {
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
@@ -297,6 +327,16 @@ TEST(SymlinkTest, FollowUpdatesATime) {
   EXPECT_LT(st_before_follow.st_atime, st_after_follow.st_atime);
 }
 
+TEST(SymlinkTest, SymlinkAtEmptyPath) {
+  auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+  auto fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY, 0666));
+  EXPECT_THAT(symlinkat(file.path().c_str(), fd.get(), ""),
+              SyscallFailsWithErrno(ENOENT));
+}
+
 class ParamSymlinkTest : public ::testing::TestWithParam<std::string> {};
 
 // Test that creating an existing symlink with creat will create the target.
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index a6325a761..ebd873068 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include <fcntl.h>
-#ifndef __fuchsia__
+#ifdef __linux__
 #include <linux/filter.h>
-#endif  // __fuchsia__
+#endif  // __linux__
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <poll.h>
@@ -903,6 +903,58 @@ TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListener) {
   EXPECT_EQ(err, ECONNREFUSED);
 }
 
+TEST_P(SimpleTcpSocketTest, SelfConnectSendRecv_NoRandomSave) {
+  // Initialize address to the loopback one.
+  sockaddr_storage addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t addrlen = sizeof(addr);
+
+  const FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  ASSERT_THAT(
+      (bind)(s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+      SyscallSucceeds());
+  // Get the bound port.
+  ASSERT_THAT(
+      getsockname(s.get(), reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+      SyscallSucceeds());
+  ASSERT_THAT(RetryEINTR(connect)(
+                  s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+              SyscallSucceeds());
+
+  constexpr int kBufSz = 1 << 20;  // 1 MiB
+  std::vector<char> writebuf(kBufSz);
+
+  // Start reading the response in a loop.
+  int read_bytes = 0;
+  ScopedThread t([&s, &read_bytes]() {
+    // Too many syscalls.
+    const DisableSave ds;
+
+    char readbuf[2500] = {};
+    int n = -1;
+    while (n != 0) {
+      ASSERT_THAT(n = RetryEINTR(read)(s.get(), &readbuf, sizeof(readbuf)),
+                  SyscallSucceeds());
+      read_bytes += n;
+    }
+  });
+
+  // Try to send the whole thing.
+  int n;
+  ASSERT_THAT(n = SendFd(s.get(), writebuf.data(), kBufSz, 0),
+              SyscallSucceeds());
+
+  // We should have written the whole thing.
+  EXPECT_EQ(n, kBufSz);
+  EXPECT_THAT(shutdown(s.get(), SHUT_WR), SyscallSucceedsWithValue(0));
+  t.Join();
+
+  // We should have read the whole thing.
+  EXPECT_EQ(read_bytes, kBufSz);
+}
+
 TEST_P(SimpleTcpSocketTest, NonBlockingConnect) {
   const FileDescriptor listener =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
@@ -1586,7 +1638,7 @@ TEST_P(SimpleTcpSocketTest, SetTCPWindowClampAboveHalfMinRcvBuf) {
   }
 }
 
-#ifndef __fuchsia__
+#ifdef __linux__
 
 // TODO(gvisor.dev/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
 // gVisor currently silently ignores attaching a filter.
@@ -1620,6 +1672,8 @@ TEST_P(SimpleTcpSocketTest, SetSocketAttachDetachFilter) {
       SyscallSucceeds());
 }
 
+#endif  // __linux__
+
 TEST_P(SimpleTcpSocketTest, SetSocketDetachFilterNoInstalledFilter) {
   // TODO(gvisor.dev/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
   SKIP_IF(IsRunningOnGvisor());
@@ -1641,7 +1695,92 @@ TEST_P(SimpleTcpSocketTest, GetSocketDetachFilter) {
               SyscallFailsWithErrno(ENOPROTOOPT));
 }
 
-#endif  // __fuchsia__
+TEST_P(SimpleTcpSocketTest, CloseNonConnectedLingerOption) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  constexpr int kLingerTimeout = 10;  // Seconds.
+
+  // Set the SO_LINGER option.
+  struct linger sl = {
+      .l_onoff = 1,
+      .l_linger = kLingerTimeout,
+  };
+  ASSERT_THAT(setsockopt(s.get(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+              SyscallSucceeds());
+
+  struct pollfd poll_fd = {
+      .fd = s.get(),
+      .events = POLLHUP,
+  };
+  constexpr int kPollTimeoutMs = 0;
+  ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+              SyscallSucceedsWithValue(1));
+
+  auto const start_time = absl::Now();
+  EXPECT_THAT(close(s.release()), SyscallSucceeds());
+  auto const end_time = absl::Now();
+
+  // Close() should not linger and return immediately.
+  ASSERT_LT((end_time - start_time), absl::Seconds(kLingerTimeout));
+}
+
+// Tests that SO_ACCEPTCONN returns non zero value for listening sockets.
+TEST_P(TcpSocketTest, GetSocketAcceptConnListener) {
+  int got = -1;
+  socklen_t length = sizeof(got);
+  ASSERT_THAT(getsockopt(listener_, SOL_SOCKET, SO_ACCEPTCONN, &got, &length),
+              SyscallSucceeds());
+  ASSERT_EQ(length, sizeof(got));
+  EXPECT_EQ(got, 1);
+}
+
+// Tests that SO_ACCEPTCONN returns zero value for not listening sockets.
+TEST_P(TcpSocketTest, GetSocketAcceptConnNonListener) {
+  int got = -1;
+  socklen_t length = sizeof(got);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_ACCEPTCONN, &got, &length),
+              SyscallSucceeds());
+  ASSERT_EQ(length, sizeof(got));
+  EXPECT_EQ(got, 0);
+
+  ASSERT_THAT(getsockopt(t_, SOL_SOCKET, SO_ACCEPTCONN, &got, &length),
+              SyscallSucceeds());
+  ASSERT_EQ(length, sizeof(got));
+  EXPECT_EQ(got, 0);
+}
+
+TEST_P(SimpleTcpSocketTest, GetSocketAcceptConnWithShutdown) {
+  // TODO(b/171345701): Fix the TCP state for listening socket on shutdown.
+  SKIP_IF(IsRunningOnGvisor());
+
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  // Initialize address to the loopback one.
+  sockaddr_storage addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t addrlen = sizeof(addr);
+
+  // Bind to some port then start listening.
+  ASSERT_THAT(bind(s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+              SyscallSucceeds());
+
+  ASSERT_THAT(listen(s.get(), SOMAXCONN), SyscallSucceeds());
+
+  int got = -1;
+  socklen_t length = sizeof(got);
+  ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_ACCEPTCONN, &got, &length),
+              SyscallSucceeds());
+  ASSERT_EQ(length, sizeof(got));
+  EXPECT_EQ(got, 1);
+
+  EXPECT_THAT(shutdown(s.get(), SHUT_RD), SyscallSucceeds());
+  ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_ACCEPTCONN, &got, &length),
+              SyscallSucceeds());
+  ASSERT_EQ(length, sizeof(got));
+  EXPECT_EQ(got, 0);
+}
 
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc
index 4b3c44527..cac94d9e1 100644
--- a/test/syscalls/linux/timers.cc
+++ b/test/syscalls/linux/timers.cc
@@ -33,6 +33,7 @@
 #include "test/util/signal_util.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
+#include "test/util/timer_util.h"
 
 ABSL_FLAG(bool, timers_test_sleep, false,
           "If true, sleep forever instead of running tests.");
@@ -215,99 +216,6 @@ TEST(TimerTest, ProcessKilledOnCPUHardLimit) {
   EXPECT_GE(cpu, kHardLimit);
 }
 
-// RAII type for a kernel "POSIX" interval timer. (The kernel provides system
-// calls such as timer_create that behave very similarly, but not identically,
-// to those described by timer_create(2); in particular, the kernel does not
-// implement SIGEV_THREAD. glibc builds POSIX-compliant interval timers based on
-// these kernel interval timers.)
-//
-// Compare implementation to FileDescriptor.
-class IntervalTimer {
- public:
-  IntervalTimer() = default;
-
-  explicit IntervalTimer(int id) { set_id(id); }
-
-  IntervalTimer(IntervalTimer&& orig) : id_(orig.release()) {}
-
-  IntervalTimer& operator=(IntervalTimer&& orig) {
-    if (this == &orig) return *this;
-    reset(orig.release());
-    return *this;
-  }
-
-  IntervalTimer(const IntervalTimer& other) = delete;
-  IntervalTimer& operator=(const IntervalTimer& other) = delete;
-
-  ~IntervalTimer() { reset(); }
-
-  int get() const { return id_; }
-
-  int release() {
-    int const id = id_;
-    id_ = -1;
-    return id;
-  }
-
-  void reset() { reset(-1); }
-
-  void reset(int id) {
-    if (id_ >= 0) {
-      TEST_PCHECK(syscall(SYS_timer_delete, id_) == 0);
-      MaybeSave();
-    }
-    set_id(id);
-  }
-
-  PosixErrorOr<struct itimerspec> Set(
-      int flags, const struct itimerspec& new_value) const {
-    struct itimerspec old_value = {};
-    if (syscall(SYS_timer_settime, id_, flags, &new_value, &old_value) < 0) {
-      return PosixError(errno, "timer_settime");
-    }
-    MaybeSave();
-    return old_value;
-  }
-
-  PosixErrorOr<struct itimerspec> Get() const {
-    struct itimerspec curr_value = {};
-    if (syscall(SYS_timer_gettime, id_, &curr_value) < 0) {
-      return PosixError(errno, "timer_gettime");
-    }
-    MaybeSave();
-    return curr_value;
-  }
-
-  PosixErrorOr<int> Overruns() const {
-    int rv = syscall(SYS_timer_getoverrun, id_);
-    if (rv < 0) {
-      return PosixError(errno, "timer_getoverrun");
-    }
-    MaybeSave();
-    return rv;
-  }
-
- private:
-  void set_id(int id) { id_ = std::max(id, -1); }
-
-  // Kernel timer_t is int; glibc timer_t is void*.
-  int id_ = -1;
-};
-
-PosixErrorOr<IntervalTimer> TimerCreate(clockid_t clockid,
-                                        const struct sigevent& sev) {
-  int timerid;
-  int ret = syscall(SYS_timer_create, clockid, &sev, &timerid);
-  if (ret < 0) {
-    return PosixError(errno, "timer_create");
-  }
-  if (ret > 0) {
-    return PosixError(EINVAL, "timer_create should never return positive");
-  }
-  MaybeSave();
-  return IntervalTimer(timerid);
-}
-
 // See timerfd.cc:TimerSlack() for rationale.
 constexpr absl::Duration kTimerSlack = absl::Milliseconds(500);
 
diff --git a/test/syscalls/linux/truncate.cc b/test/syscalls/linux/truncate.cc
index c988c6380..bfc95ed38 100644
--- a/test/syscalls/linux/truncate.cc
+++ b/test/syscalls/linux/truncate.cc
@@ -196,6 +196,26 @@ TEST(TruncateTest, FtruncateNonWriteable) {
   EXPECT_THAT(ftruncate(fd.get(), 0), SyscallFailsWithErrno(EINVAL));
 }
 
+// ftruncate(2) should succeed as long as the file descriptor is writeable,
+// regardless of whether the file permissions allow writing.
+TEST(TruncateTest, FtruncateWithoutWritePermission_NoRandomSave) {
+  // Drop capabilities that allow us to override file permissions.
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+
+  // The only time we can open a file with flags forbidden by its permissions
+  // is when we are creating the file. We cannot re-open with the same flags,
+  // so we cannot restore an fd obtained from such an operation.
+  const DisableSave ds;
+  auto path = NewTempAbsPath();
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_RDWR | O_CREAT, 0444));
+
+  // In goferfs, ftruncate may be converted to a remote truncate operation that
+  // unavoidably requires write permission.
+  SKIP_IF(IsRunningOnGvisor() && !ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs(path)));
+  ASSERT_THAT(ftruncate(fd.get(), 100), SyscallSucceeds());
+}
+
 TEST(TruncateTest, TruncateNonExist) {
   EXPECT_THAT(truncate("/foo/bar", 0), SyscallFailsWithErrno(ENOENT));
 }
diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc
index 7a8ac30a4..6a488fec6 100644
--- a/test/syscalls/linux/udp_socket.cc
+++ b/test/syscalls/linux/udp_socket.cc
@@ -12,13 +12,1844 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "test/syscalls/linux/udp_socket_test_cases.h"
+#include <arpa/inet.h>
+#include <fcntl.h>
+
+#include <ctime>
+
+#ifdef __linux__
+#include <linux/errqueue.h>
+#include <linux/filter.h>
+#endif  // __linux__
+#include <netinet/in.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "absl/strings/str_format.h"
+#ifndef SIOCGSTAMP
+#include <linux/sockios.h>
+#endif
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
 
 namespace gvisor {
 namespace testing {
 
 namespace {
 
+// Fixture for tests parameterized by the address family to use (AF_INET and
+// AF_INET6) when creating sockets.
+class UdpSocketTest
+    : public ::testing::TestWithParam<gvisor::testing::AddressFamily> {
+ protected:
+  // Creates two sockets that will be used by test cases.
+  void SetUp() override;
+
+  // Binds the socket bind_ to the loopback and updates bind_addr_.
+  PosixError BindLoopback();
+
+  // Binds the socket bind_ to Any and updates bind_addr_.
+  PosixError BindAny();
+
+  // Binds given socket to address addr and updates.
+  PosixError BindSocket(int socket, struct sockaddr* addr);
+
+  // Return initialized Any address to port 0.
+  struct sockaddr_storage InetAnyAddr();
+
+  // Return initialized Loopback address to port 0.
+  struct sockaddr_storage InetLoopbackAddr();
+
+  // Disconnects socket sockfd.
+  void Disconnect(int sockfd);
+
+  // Get family for the test.
+  int GetFamily();
+
+  // Socket used by Bind methods
+  FileDescriptor bind_;
+
+  // Second socket used for tests.
+  FileDescriptor sock_;
+
+  // Address for bind_ socket.
+  struct sockaddr* bind_addr_;
+
+  // Initialized to the length based on GetFamily().
+  socklen_t addrlen_;
+
+  // Storage for bind_addr_.
+  struct sockaddr_storage bind_addr_storage_;
+
+ private:
+  // Helper to initialize addrlen_ for the test case.
+  socklen_t GetAddrLength();
+};
+
+// Gets a pointer to the port component of the given address.
+uint16_t* Port(struct sockaddr_storage* addr) {
+  switch (addr->ss_family) {
+    case AF_INET: {
+      auto sin = reinterpret_cast<struct sockaddr_in*>(addr);
+      return &sin->sin_port;
+    }
+    case AF_INET6: {
+      auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr);
+      return &sin6->sin6_port;
+    }
+  }
+
+  return nullptr;
+}
+
+// Sets addr port to "port".
+void SetPort(struct sockaddr_storage* addr, uint16_t port) {
+  switch (addr->ss_family) {
+    case AF_INET: {
+      auto sin = reinterpret_cast<struct sockaddr_in*>(addr);
+      sin->sin_port = port;
+      break;
+    }
+    case AF_INET6: {
+      auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr);
+      sin6->sin6_port = port;
+      break;
+    }
+  }
+}
+
+void UdpSocketTest::SetUp() {
+  addrlen_ = GetAddrLength();
+
+  bind_ =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP));
+  memset(&bind_addr_storage_, 0, sizeof(bind_addr_storage_));
+  bind_addr_ = reinterpret_cast<struct sockaddr*>(&bind_addr_storage_);
+
+  sock_ =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP));
+}
+
+int UdpSocketTest::GetFamily() {
+  if (GetParam() == AddressFamily::kIpv4) {
+    return AF_INET;
+  }
+  return AF_INET6;
+}
+
+PosixError UdpSocketTest::BindLoopback() {
+  bind_addr_storage_ = InetLoopbackAddr();
+  struct sockaddr* bind_addr_ =
+      reinterpret_cast<struct sockaddr*>(&bind_addr_storage_);
+  return BindSocket(bind_.get(), bind_addr_);
+}
+
+PosixError UdpSocketTest::BindAny() {
+  bind_addr_storage_ = InetAnyAddr();
+  struct sockaddr* bind_addr_ =
+      reinterpret_cast<struct sockaddr*>(&bind_addr_storage_);
+  return BindSocket(bind_.get(), bind_addr_);
+}
+
+PosixError UdpSocketTest::BindSocket(int socket, struct sockaddr* addr) {
+  socklen_t len = sizeof(bind_addr_storage_);
+
+  // Bind, then check that we get the right address.
+  RETURN_ERROR_IF_SYSCALL_FAIL(bind(socket, addr, addrlen_));
+
+  RETURN_ERROR_IF_SYSCALL_FAIL(getsockname(socket, addr, &len));
+
+  if (addrlen_ != len) {
+    return PosixError(
+        EINVAL,
+        absl::StrFormat("getsockname len: %u expected: %u", len, addrlen_));
+  }
+  return PosixError(0);
+}
+
+socklen_t UdpSocketTest::GetAddrLength() {
+  struct sockaddr_storage addr;
+  if (GetFamily() == AF_INET) {
+    auto sin = reinterpret_cast<struct sockaddr_in*>(&addr);
+    return sizeof(*sin);
+  }
+
+  auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&addr);
+  return sizeof(*sin6);
+}
+
+sockaddr_storage UdpSocketTest::InetAnyAddr() {
+  struct sockaddr_storage addr;
+  memset(&addr, 0, sizeof(addr));
+  reinterpret_cast<struct sockaddr*>(&addr)->sa_family = GetFamily();
+
+  if (GetFamily() == AF_INET) {
+    auto sin = reinterpret_cast<struct sockaddr_in*>(&addr);
+    sin->sin_addr.s_addr = htonl(INADDR_ANY);
+    sin->sin_port = htons(0);
+    return addr;
+  }
+
+  auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&addr);
+  sin6->sin6_addr = IN6ADDR_ANY_INIT;
+  sin6->sin6_port = htons(0);
+  return addr;
+}
+
+sockaddr_storage UdpSocketTest::InetLoopbackAddr() {
+  struct sockaddr_storage addr;
+  memset(&addr, 0, sizeof(addr));
+  reinterpret_cast<struct sockaddr*>(&addr)->sa_family = GetFamily();
+
+  if (GetFamily() == AF_INET) {
+    auto sin = reinterpret_cast<struct sockaddr_in*>(&addr);
+    sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+    sin->sin_port = htons(0);
+    return addr;
+  }
+  auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&addr);
+  sin6->sin6_addr = in6addr_loopback;
+  sin6->sin6_port = htons(0);
+  return addr;
+}
+
+void UdpSocketTest::Disconnect(int sockfd) {
+  sockaddr_storage addr_storage = InetAnyAddr();
+  sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+  socklen_t addrlen = sizeof(addr_storage);
+
+  addr->sa_family = AF_UNSPEC;
+  ASSERT_THAT(connect(sockfd, addr, addrlen), SyscallSucceeds());
+
+  // Check that after disconnect the socket is bound to the ANY address.
+  EXPECT_THAT(getsockname(sockfd, addr, &addrlen), SyscallSucceeds());
+  if (GetParam() == AddressFamily::kIpv4) {
+    auto addr_out = reinterpret_cast<struct sockaddr_in*>(addr);
+    EXPECT_EQ(addrlen, sizeof(*addr_out));
+    EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
+  } else {
+    auto addr_out = reinterpret_cast<struct sockaddr_in6*>(addr);
+    EXPECT_EQ(addrlen, sizeof(*addr_out));
+    struct in6_addr loopback = IN6ADDR_ANY_INIT;
+
+    EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
+  }
+}
+
+TEST_P(UdpSocketTest, Creation) {
+  FileDescriptor sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP));
+  EXPECT_THAT(close(sock.release()), SyscallSucceeds());
+
+  sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, 0));
+  EXPECT_THAT(close(sock.release()), SyscallSucceeds());
+
+  ASSERT_THAT(socket(GetFamily(), SOCK_STREAM, IPPROTO_UDP), SyscallFails());
+}
+
+TEST_P(UdpSocketTest, Getsockname) {
+  // Check that we're not bound.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(
+      getsockname(bind_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+      SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  struct sockaddr_storage any = InetAnyAddr();
+  EXPECT_EQ(memcmp(&addr, reinterpret_cast<struct sockaddr*>(&any), addrlen_),
+            0);
+
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  EXPECT_THAT(
+      getsockname(bind_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+      SyscallSucceeds());
+
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, bind_addr_, addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, Getpeername) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Check that we're not connected.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(
+      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+      SyscallFailsWithErrno(ENOTCONN));
+
+  // Connect, then check that we get the right address.
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  addrlen = sizeof(addr);
+  EXPECT_THAT(
+      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+      SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, bind_addr_, addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, SendNotConnected) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Do send & write, they must fail.
+  char buf[512];
+  EXPECT_THAT(send(sock_.get(), buf, sizeof(buf), 0),
+              SyscallFailsWithErrno(EDESTADDRREQ));
+
+  EXPECT_THAT(write(sock_.get(), buf, sizeof(buf)),
+              SyscallFailsWithErrno(EDESTADDRREQ));
+
+  // Use sendto.
+  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Check that we're bound now.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(
+      getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+      SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_NE(*Port(&addr), 0);
+}
+
+TEST_P(UdpSocketTest, ConnectBinds) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Connect the socket.
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  // Check that we're bound now.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(
+      getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+      SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_NE(*Port(&addr), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveNotBound) {
+  char buf[512];
+  EXPECT_THAT(recv(sock_.get(), buf, sizeof(buf), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, Bind) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Try to bind again.
+  EXPECT_THAT(bind(bind_.get(), bind_addr_, addrlen_),
+              SyscallFailsWithErrno(EINVAL));
+
+  // Check that we're still bound to the original address.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(
+      getsockname(bind_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+      SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, bind_addr_, addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, BindInUse) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Try to bind again.
+  EXPECT_THAT(bind(sock_.get(), bind_addr_, addrlen_),
+              SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(UdpSocketTest, ReceiveAfterConnect) {
+  ASSERT_NO_ERRNO(BindLoopback());
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  // Send from sock_ to bind_
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Receive the data.
+  char received[sizeof(buf)];
+  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), 0),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveAfterDisconnect) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  for (int i = 0; i < 2; i++) {
+    // Connet sock_ to bound address.
+    ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+    struct sockaddr_storage addr;
+    socklen_t addrlen = sizeof(addr);
+    EXPECT_THAT(
+        getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+        SyscallSucceeds());
+    EXPECT_EQ(addrlen, addrlen_);
+
+    // Send from sock to bind_.
+    char buf[512];
+    RandomizeBuffer(buf, sizeof(buf));
+
+    ASSERT_THAT(sendto(bind_.get(), buf, sizeof(buf), 0,
+                       reinterpret_cast<sockaddr*>(&addr), addrlen),
+                SyscallSucceedsWithValue(sizeof(buf)));
+
+    // Receive the data.
+    char received[sizeof(buf)];
+    EXPECT_THAT(recv(sock_.get(), received, sizeof(received), 0),
+                SyscallSucceedsWithValue(sizeof(received)));
+    EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+
+    // Disconnect sock_.
+    struct sockaddr unspec = {};
+    unspec.sa_family = AF_UNSPEC;
+    ASSERT_THAT(connect(sock_.get(), &unspec, sizeof(unspec.sa_family)),
+                SyscallSucceeds());
+  }
+}
+
+TEST_P(UdpSocketTest, Connect) {
+  ASSERT_NO_ERRNO(BindLoopback());
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  // Check that we're connected to the right peer.
+  struct sockaddr_storage peer;
+  socklen_t peerlen = sizeof(peer);
+  EXPECT_THAT(
+      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&peer), &peerlen),
+      SyscallSucceeds());
+  EXPECT_EQ(peerlen, addrlen_);
+  EXPECT_EQ(memcmp(&peer, bind_addr_, addrlen_), 0);
+
+  // Try to bind after connect.
+  struct sockaddr_storage any = InetAnyAddr();
+  EXPECT_THAT(
+      bind(sock_.get(), reinterpret_cast<struct sockaddr*>(&any), addrlen_),
+      SyscallFailsWithErrno(EINVAL));
+
+  struct sockaddr_storage bind2_storage = InetLoopbackAddr();
+  struct sockaddr* bind2_addr =
+      reinterpret_cast<struct sockaddr*>(&bind2_storage);
+  FileDescriptor bind2 =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP));
+  ASSERT_NO_ERRNO(BindSocket(bind2.get(), bind2_addr));
+
+  // Try to connect again.
+  EXPECT_THAT(connect(sock_.get(), bind2_addr, addrlen_), SyscallSucceeds());
+
+  // Check that peer name changed.
+  peerlen = sizeof(peer);
+  EXPECT_THAT(
+      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&peer), &peerlen),
+      SyscallSucceeds());
+  EXPECT_EQ(peerlen, addrlen_);
+  EXPECT_EQ(memcmp(&peer, bind2_addr, addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, ConnectAnyZero) {
+  // TODO(138658473): Enable when we can connect to port 0 with gVisor.
+  SKIP_IF(IsRunningOnGvisor());
+
+  struct sockaddr_storage any = InetAnyAddr();
+  EXPECT_THAT(
+      connect(sock_.get(), reinterpret_cast<struct sockaddr*>(&any), addrlen_),
+      SyscallSucceeds());
+
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(
+      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+      SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, ConnectAnyWithPort) {
+  ASSERT_NO_ERRNO(BindAny());
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(
+      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+      SyscallSucceeds());
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterConnectAny) {
+  // TODO(138658473): Enable when we can connect to port 0 with gVisor.
+  SKIP_IF(IsRunningOnGvisor());
+  struct sockaddr_storage any = InetAnyAddr();
+  EXPECT_THAT(
+      connect(sock_.get(), reinterpret_cast<struct sockaddr*>(&any), addrlen_),
+      SyscallSucceeds());
+
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(
+      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+      SyscallFailsWithErrno(ENOTCONN));
+
+  Disconnect(sock_.get());
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterConnectAnyWithPort) {
+  ASSERT_NO_ERRNO(BindAny());
+  EXPECT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(
+      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+      SyscallSucceeds());
+
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(*Port(&bind_addr_storage_), *Port(&addr));
+
+  Disconnect(sock_.get());
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterBind) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Bind to the next port above bind_.
+  struct sockaddr_storage addr_storage = InetLoopbackAddr();
+  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+  ASSERT_NO_ERRNO(BindSocket(sock_.get(), addr));
+
+  // Connect the socket.
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  struct sockaddr_storage unspec = {};
+  unspec.ss_family = AF_UNSPEC;
+  EXPECT_THAT(connect(sock_.get(), reinterpret_cast<sockaddr*>(&unspec),
+                      sizeof(unspec.ss_family)),
+              SyscallSucceeds());
+
+  // Check that we're still bound.
+  socklen_t addrlen = sizeof(unspec);
+  EXPECT_THAT(
+      getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&unspec), &addrlen),
+      SyscallSucceeds());
+
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(addr, &unspec, addrlen_), 0);
+
+  addrlen = sizeof(addr);
+  EXPECT_THAT(getpeername(sock_.get(), addr, &addrlen),
+              SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, BindToAnyConnnectToLocalhost) {
+  ASSERT_NO_ERRNO(BindAny());
+
+  struct sockaddr_storage addr_storage = InetLoopbackAddr();
+  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+  socklen_t addrlen = sizeof(addr);
+
+  // Connect the socket.
+  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+  EXPECT_THAT(getsockname(bind_.get(), addr, &addrlen), SyscallSucceeds());
+
+  // If the socket is bound to ANY and connected to a loopback address,
+  // getsockname() has to return the loopback address.
+  if (GetParam() == AddressFamily::kIpv4) {
+    auto addr_out = reinterpret_cast<struct sockaddr_in*>(addr);
+    EXPECT_EQ(addrlen, sizeof(*addr_out));
+    EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK));
+  } else {
+    auto addr_out = reinterpret_cast<struct sockaddr_in6*>(addr);
+    struct in6_addr loopback = IN6ADDR_LOOPBACK_INIT;
+    EXPECT_EQ(addrlen, sizeof(*addr_out));
+    EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
+  }
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterBindToAny) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  struct sockaddr_storage any_storage = InetAnyAddr();
+  struct sockaddr* any = reinterpret_cast<struct sockaddr*>(&any_storage);
+  SetPort(&any_storage, *Port(&bind_addr_storage_) + 1);
+
+  ASSERT_NO_ERRNO(BindSocket(sock_.get(), any));
+
+  // Connect the socket.
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  Disconnect(sock_.get());
+
+  // Check that we're still bound.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(
+      getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+      SyscallSucceeds());
+
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, any, addrlen), 0);
+
+  addrlen = sizeof(addr);
+  EXPECT_THAT(
+      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+      SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, Disconnect) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  struct sockaddr_storage any_storage = InetAnyAddr();
+  struct sockaddr* any = reinterpret_cast<struct sockaddr*>(&any_storage);
+  SetPort(&any_storage, *Port(&bind_addr_storage_) + 1);
+  ASSERT_NO_ERRNO(BindSocket(sock_.get(), any));
+
+  for (int i = 0; i < 2; i++) {
+    // Try to connect again.
+    EXPECT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+    // Check that we're connected to the right peer.
+    struct sockaddr_storage peer;
+    socklen_t peerlen = sizeof(peer);
+    EXPECT_THAT(
+        getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&peer), &peerlen),
+        SyscallSucceeds());
+    EXPECT_EQ(peerlen, addrlen_);
+    EXPECT_EQ(memcmp(&peer, bind_addr_, addrlen_), 0);
+
+    // Try to disconnect.
+    struct sockaddr_storage addr = {};
+    addr.ss_family = AF_UNSPEC;
+    EXPECT_THAT(connect(sock_.get(), reinterpret_cast<sockaddr*>(&addr),
+                        sizeof(addr.ss_family)),
+                SyscallSucceeds());
+
+    peerlen = sizeof(peer);
+    EXPECT_THAT(
+        getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&peer), &peerlen),
+        SyscallFailsWithErrno(ENOTCONN));
+
+    // Check that we're still bound.
+    socklen_t addrlen = sizeof(addr);
+    EXPECT_THAT(
+        getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+        SyscallSucceeds());
+    EXPECT_EQ(addrlen, addrlen_);
+    EXPECT_EQ(*Port(&addr), *Port(&any_storage));
+  }
+}
+
+TEST_P(UdpSocketTest, ConnectBadAddress) {
+  struct sockaddr addr = {};
+  addr.sa_family = GetFamily();
+  ASSERT_THAT(connect(sock_.get(), &addr, sizeof(addr.sa_family)),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  struct sockaddr_storage addr_storage = InetAnyAddr();
+  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  // Send to a different destination than we're connected to.
+  char buf[512];
+  EXPECT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, addr, addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+}
+
+TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
+  // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
+  SKIP_IF(IsRunningWithHostinet());
+
+  ASSERT_NO_ERRNO(BindLoopback());
+  // Connect to loopback:bind_addr_+1.
+  struct sockaddr_storage addr_storage = InetLoopbackAddr();
+  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+  // Bind sock to loopback:bind_addr_+1.
+  ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
+
+  char buf[3];
+  // Send zero length packet from bind_ to sock_.
+  ASSERT_THAT(write(bind_.get(), buf, 0), SyscallSucceedsWithValue(0));
+
+  struct pollfd pfd = {sock_.get(), POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout*/ 1000),
+              SyscallSucceedsWithValue(1));
+
+  // Receive the packet.
+  char received[3];
+  EXPECT_THAT(read(sock_.get(), received, sizeof(received)),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
+  // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
+  SKIP_IF(IsRunningWithHostinet());
+
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Connect to loopback:bind_addr_port+1.
+  struct sockaddr_storage addr_storage = InetLoopbackAddr();
+  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+  // Bind sock to loopback:bind_addr_port+1.
+  ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
+
+  // Set sock to non-blocking.
+  int opts = 0;
+  ASSERT_THAT(opts = fcntl(sock_.get(), F_GETFL), SyscallSucceeds());
+  ASSERT_THAT(fcntl(sock_.get(), F_SETFL, opts | O_NONBLOCK),
+              SyscallSucceeds());
+
+  char buf[3];
+  // Send zero length packet from bind_ to sock_.
+  ASSERT_THAT(write(bind_.get(), buf, 0), SyscallSucceedsWithValue(0));
+
+  struct pollfd pfd = {sock_.get(), POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
+  // Receive the packet.
+  char received[3];
+  EXPECT_THAT(read(sock_.get(), received, sizeof(received)),
+              SyscallSucceedsWithValue(0));
+  EXPECT_THAT(read(sock_.get(), received, sizeof(received)),
+              SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(UdpSocketTest, SendAndReceiveNotConnected) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Send some data to bind_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Receive the data.
+  char received[sizeof(buf)];
+  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), 0),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, SendAndReceiveConnected) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Connect to loopback:bind_addr_port+1.
+  struct sockaddr_storage addr_storage = InetLoopbackAddr();
+  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+  // Bind sock to loopback:bind_addr_port+1.
+  ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
+
+  // Send some data from sock to bind_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Receive the data.
+  char received[sizeof(buf)];
+  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), 0),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveFromNotConnected) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Connect to loopback:bind_addr_port+1.
+  struct sockaddr_storage addr_storage = InetLoopbackAddr();
+  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+  // Bind sock to loopback:bind_addr_port+2.
+  struct sockaddr_storage addr2_storage = InetLoopbackAddr();
+  struct sockaddr* addr2 = reinterpret_cast<struct sockaddr*>(&addr2_storage);
+  SetPort(&addr2_storage, *Port(&bind_addr_storage_) + 2);
+  ASSERT_THAT(bind(sock_.get(), addr2, addrlen_), SyscallSucceeds());
+
+  // Send some data from sock to bind_.
+  char buf[512];
+  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Check that the data isn't received because it was sent from a different
+  // address than we're connected.
+  EXPECT_THAT(recv(sock_.get(), buf, sizeof(buf), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReceiveBeforeConnect) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Bind sock to loopback:bind_addr_port+2.
+  struct sockaddr_storage addr2_storage = InetLoopbackAddr();
+  struct sockaddr* addr2 = reinterpret_cast<struct sockaddr*>(&addr2_storage);
+  SetPort(&addr2_storage, *Port(&bind_addr_storage_) + 2);
+  ASSERT_THAT(bind(sock_.get(), addr2, addrlen_), SyscallSucceeds());
+
+  // Send some data from sock to bind_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Connect to loopback:bind_addr_port+1.
+  struct sockaddr_storage addr_storage = InetLoopbackAddr();
+  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+  // Receive the data. It works because it was sent before the connect.
+  char received[sizeof(buf)];
+  EXPECT_THAT(
+      RecvTimeout(bind_.get(), received, sizeof(received), 1 /*timeout*/),
+      IsPosixErrorOkAndHolds(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+
+  // Send again. This time it should not be received.
+  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(recv(bind_.get(), buf, sizeof(buf), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReceiveFrom) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Connect to loopback:bind_addr_port+1.
+  struct sockaddr_storage addr_storage = InetLoopbackAddr();
+  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+  // Bind sock to loopback:bind_addr_port+1.
+  ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
+
+  // Send some data from sock to bind_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Receive the data and sender address.
+  char received[sizeof(buf)];
+  struct sockaddr_storage addr2;
+  socklen_t addr2len = sizeof(addr2);
+  EXPECT_THAT(recvfrom(bind_.get(), received, sizeof(received), 0,
+                       reinterpret_cast<sockaddr*>(&addr2), &addr2len),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+  EXPECT_EQ(addr2len, addrlen_);
+  EXPECT_EQ(memcmp(addr, &addr2, addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, Listen) {
+  ASSERT_THAT(listen(sock_.get(), SOMAXCONN),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+TEST_P(UdpSocketTest, Accept) {
+  ASSERT_THAT(accept(sock_.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+// This test validates that a read shutdown with pending data allows the read
+// to proceed with the data before returning EAGAIN.
+TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Connect to loopback:bind_addr_port+1.
+  struct sockaddr_storage addr_storage = InetLoopbackAddr();
+  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+  // Bind to loopback:bind_addr_port+1 and connect to bind_addr_.
+  ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  // Verify that we get EWOULDBLOCK when there is nothing to read.
+  char received[512];
+  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  const char* buf = "abc";
+  EXPECT_THAT(write(sock_.get(), buf, 3), SyscallSucceedsWithValue(3));
+
+  int opts = 0;
+  ASSERT_THAT(opts = fcntl(bind_.get(), F_GETFL), SyscallSucceeds());
+  ASSERT_THAT(fcntl(bind_.get(), F_SETFL, opts | O_NONBLOCK),
+              SyscallSucceeds());
+  ASSERT_THAT(opts = fcntl(bind_.get(), F_GETFL), SyscallSucceeds());
+  ASSERT_NE(opts & O_NONBLOCK, 0);
+
+  EXPECT_THAT(shutdown(bind_.get(), SHUT_RD), SyscallSucceeds());
+
+  struct pollfd pfd = {bind_.get(), POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
+  // We should get the data even though read has been shutdown.
+  EXPECT_THAT(RecvTimeout(bind_.get(), received, 2 /*buf_size*/, 1 /*timeout*/),
+              IsPosixErrorOkAndHolds(2));
+
+  // Because we read less than the entire packet length, since it's a packet
+  // based socket any subsequent reads should return EWOULDBLOCK.
+  EXPECT_THAT(recv(bind_.get(), received, 1, 0),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+// This test is validating that even after a socket is shutdown if it's
+// reconnected it will reset the shutdown state.
+TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) {
+  char received[512];
+  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  EXPECT_THAT(shutdown(bind_.get(), SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
+
+  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Connect the socket, then try to shutdown again.
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Connect to loopback:bind_addr_port+1.
+  struct sockaddr_storage addr_storage = InetLoopbackAddr();
+  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReadShutdown) {
+  // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
+  // MSG_DONTWAIT blocks indefinitely.
+  SKIP_IF(IsRunningWithHostinet());
+
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  char received[512];
+  EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  EXPECT_THAT(shutdown(sock_.get(), SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
+
+  EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Connect the socket, then try to shutdown again.
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  EXPECT_THAT(shutdown(sock_.get(), SHUT_RD), SyscallSucceeds());
+
+  EXPECT_THAT(recv(sock_.get(), received, sizeof(received), 0),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, ReadShutdownDifferentThread) {
+  // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
+  // MSG_DONTWAIT blocks indefinitely.
+  SKIP_IF(IsRunningWithHostinet());
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  char received[512];
+  EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Connect the socket, then shutdown from another thread.
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  ScopedThread t([&] {
+    absl::SleepFor(absl::Milliseconds(200));
+    EXPECT_THAT(shutdown(sock_.get(), SHUT_RD), SyscallSucceeds());
+  });
+  EXPECT_THAT(RetryEINTR(recv)(sock_.get(), received, sizeof(received), 0),
+              SyscallSucceedsWithValue(0));
+  t.Join();
+
+  EXPECT_THAT(RetryEINTR(recv)(sock_.get(), received, sizeof(received), 0),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, WriteShutdown) {
+  ASSERT_NO_ERRNO(BindLoopback());
+  EXPECT_THAT(shutdown(sock_.get(), SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+  EXPECT_THAT(shutdown(sock_.get(), SHUT_WR), SyscallSucceeds());
+}
+
+TEST_P(UdpSocketTest, SynchronousReceive) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Send some data to bind_ from another thread.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  // Receive the data prior to actually starting the other thread.
+  char received[512];
+  EXPECT_THAT(
+      RetryEINTR(recv)(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
+      SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Start the thread.
+  ScopedThread t([&] {
+    absl::SleepFor(absl::Milliseconds(200));
+    ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, this->bind_addr_,
+                       this->addrlen_),
+                SyscallSucceedsWithValue(sizeof(buf)));
+  });
+
+  EXPECT_THAT(RetryEINTR(recv)(bind_.get(), received, sizeof(received), 0),
+              SyscallSucceedsWithValue(512));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_SendRecv) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Send 3 packets from sock to bind_.
+  constexpr int psize = 100;
+  char buf[3 * psize];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_THAT(
+        sendto(sock_.get(), buf + i * psize, psize, 0, bind_addr_, addrlen_),
+        SyscallSucceedsWithValue(psize));
+  }
+
+  // Receive the data as 3 separate packets.
+  char received[6 * psize];
+  for (int i = 0; i < 3; ++i) {
+    EXPECT_THAT(recv(bind_.get(), received + i * psize, 3 * psize, 0),
+                SyscallSucceedsWithValue(psize));
+  }
+  EXPECT_EQ(memcmp(buf, received, 3 * psize), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_WritevReadv) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Direct writes from sock to bind_.
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  // Send 2 packets from sock to bind_, where each packet's data consists of
+  // 2 discontiguous iovecs.
+  constexpr size_t kPieceSize = 100;
+  char buf[4 * kPieceSize];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  for (int i = 0; i < 2; i++) {
+    struct iovec iov[2];
+    for (int j = 0; j < 2; j++) {
+      iov[j].iov_base = reinterpret_cast<void*>(
+          reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
+      iov[j].iov_len = kPieceSize;
+    }
+    ASSERT_THAT(writev(sock_.get(), iov, 2),
+                SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+
+  // Receive the data as 2 separate packets.
+  char received[6 * kPieceSize];
+  for (int i = 0; i < 2; i++) {
+    struct iovec iov[3];
+    for (int j = 0; j < 3; j++) {
+      iov[j].iov_base = reinterpret_cast<void*>(
+          reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
+      iov[j].iov_len = kPieceSize;
+    }
+    ASSERT_THAT(readv(bind_.get(), iov, 3),
+                SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+  EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_SendMsgRecvMsg) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Send 2 packets from sock to bind_, where each packet's data consists of
+  // 2 discontiguous iovecs.
+  constexpr size_t kPieceSize = 100;
+  char buf[4 * kPieceSize];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  for (int i = 0; i < 2; i++) {
+    struct iovec iov[2];
+    for (int j = 0; j < 2; j++) {
+      iov[j].iov_base = reinterpret_cast<void*>(
+          reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
+      iov[j].iov_len = kPieceSize;
+    }
+    struct msghdr msg = {};
+    msg.msg_name = bind_addr_;
+    msg.msg_namelen = addrlen_;
+    msg.msg_iov = iov;
+    msg.msg_iovlen = 2;
+    ASSERT_THAT(sendmsg(sock_.get(), &msg, 0),
+                SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+
+  // Receive the data as 2 separate packets.
+  char received[6 * kPieceSize];
+  for (int i = 0; i < 2; i++) {
+    struct iovec iov[3];
+    for (int j = 0; j < 3; j++) {
+      iov[j].iov_base = reinterpret_cast<void*>(
+          reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
+      iov[j].iov_len = kPieceSize;
+    }
+    struct msghdr msg = {};
+    msg.msg_iov = iov;
+    msg.msg_iovlen = 3;
+    ASSERT_THAT(recvmsg(bind_.get(), &msg, 0),
+                SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+  EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
+}
+
+TEST_P(UdpSocketTest, FIONREADShutdown) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  int n = -1;
+  EXPECT_THAT(ioctl(sock_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // A UDP socket must be connected before it can be shutdown.
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(sock_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  EXPECT_THAT(shutdown(sock_.get(), SHUT_RD), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(sock_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+}
+
+TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
+  int n = -1;
+  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // A UDP socket must be connected before it can be shutdown.
+  ASSERT_THAT(connect(bind_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  const char str[] = "abc";
+  ASSERT_THAT(send(bind_.get(), str, sizeof(str), 0),
+              SyscallSucceedsWithValue(sizeof(str)));
+
+  struct pollfd pfd = {bind_.get(), POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
+  n = -1;
+  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, sizeof(str));
+
+  EXPECT_THAT(shutdown(bind_.get(), SHUT_RD), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, sizeof(str));
+}
+
+// NOTE: Do not use `FIONREAD` as test name because it will be replaced by the
+// corresponding macro and become `0x541B`.
+TEST_P(UdpSocketTest, Fionread) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Check that the bound socket with an empty buffer reports an empty first
+  // packet.
+  int n = -1;
+  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // Send 3 packets from sock to bind_.
+  constexpr int psize = 100;
+  char buf[3 * psize];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  struct pollfd pfd = {bind_.get(), POLLIN, 0};
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_THAT(
+        sendto(sock_.get(), buf + i * psize, psize, 0, bind_addr_, addrlen_),
+        SyscallSucceedsWithValue(psize));
+
+    ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+                SyscallSucceedsWithValue(1));
+
+    // Check that regardless of how many packets are in the queue, the size
+    // reported is that of a single packet.
+    n = -1;
+    EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+    EXPECT_EQ(n, psize);
+  }
+}
+
+TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) {
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // Check that the bound socket with an empty buffer reports an empty first
+  // packet.
+  int n = -1;
+  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // Send 3 packets from sock to bind_.
+  constexpr int psize = 100;
+  char buf[3 * psize];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  struct pollfd pfd = {bind_.get(), POLLIN, 0};
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_THAT(
+        sendto(sock_.get(), buf + i * psize, 0, 0, bind_addr_, addrlen_),
+        SyscallSucceedsWithValue(0));
+
+    // TODO(gvisor.dev/issue/2726): sending a zero-length message to a hostinet
+    // socket does not cause a poll event to be triggered.
+    if (!IsRunningWithHostinet()) {
+      ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+                  SyscallSucceedsWithValue(1));
+    }
+
+    // Check that regardless of how many packets are in the queue, the size
+    // reported is that of a single packet.
+    n = -1;
+    EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+    EXPECT_EQ(n, 0);
+  }
+}
+
+TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) {
+  int n = -1;
+  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  // A UDP socket must be connected before it can be shutdown.
+  ASSERT_THAT(connect(bind_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  const char str[] = "abc";
+  ASSERT_THAT(send(bind_.get(), str, 0, 0), SyscallSucceedsWithValue(0));
+
+  n = -1;
+  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  EXPECT_THAT(shutdown(bind_.get(), SHUT_RD), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+}
+
+TEST_P(UdpSocketTest, SoNoCheckOffByDefault) {
+  // TODO(gvisor.dev/issue/1202): SO_NO_CHECK socket option not supported by
+  // hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
+  int v = -1;
+  socklen_t optlen = sizeof(v);
+  ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, &optlen),
+              SyscallSucceeds());
+  ASSERT_EQ(v, kSockOptOff);
+  ASSERT_EQ(optlen, sizeof(v));
+}
+
+TEST_P(UdpSocketTest, SoNoCheck) {
+  // TODO(gvisor.dev/issue/1202): SO_NO_CHECK socket option not supported by
+  // hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
+  int v = kSockOptOn;
+  socklen_t optlen = sizeof(v);
+  ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, optlen),
+              SyscallSucceeds());
+  v = -1;
+  ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, &optlen),
+              SyscallSucceeds());
+  ASSERT_EQ(v, kSockOptOn);
+  ASSERT_EQ(optlen, sizeof(v));
+
+  v = kSockOptOff;
+  ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, optlen),
+              SyscallSucceeds());
+  v = -1;
+  ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, &optlen),
+              SyscallSucceeds());
+  ASSERT_EQ(v, kSockOptOff);
+  ASSERT_EQ(optlen, sizeof(v));
+}
+
+#ifdef __linux__
+TEST_P(UdpSocketTest, ErrorQueue) {
+  char cmsgbuf[CMSG_SPACE(sizeof(sock_extended_err))];
+  msghdr msg;
+  memset(&msg, 0, sizeof(msg));
+  iovec iov;
+  memset(&iov, 0, sizeof(iov));
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+  msg.msg_control = cmsgbuf;
+  msg.msg_controllen = sizeof(cmsgbuf);
+
+  // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT.
+  EXPECT_THAT(RetryEINTR(recvmsg)(bind_.get(), &msg, MSG_ERRQUEUE),
+              SyscallFailsWithErrno(EAGAIN));
+}
+#endif  // __linux__
+
+TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
+  // TODO(gvisor.dev/issue/1202): SO_TIMESTAMP socket option not supported by
+  // hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
+  int v = -1;
+  socklen_t optlen = sizeof(v);
+  ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_TIMESTAMP, &v, &optlen),
+              SyscallSucceeds());
+  ASSERT_EQ(v, kSockOptOff);
+  ASSERT_EQ(optlen, sizeof(v));
+}
+
+TEST_P(UdpSocketTest, SoTimestamp) {
+  // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
+  // supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
+  ASSERT_NO_ERRNO(BindLoopback());
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  int v = 1;
+  ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
+              SyscallSucceeds());
+
+  char buf[3];
+  // Send zero length packet from sock to bind_.
+  ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, 0),
+              SyscallSucceedsWithValue(0));
+
+  struct pollfd pfd = {bind_.get(), POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
+  char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
+  msghdr msg;
+  memset(&msg, 0, sizeof(msg));
+  iovec iov;
+  memset(&iov, 0, sizeof(iov));
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+  msg.msg_control = cmsgbuf;
+  msg.msg_controllen = sizeof(cmsgbuf);
+
+  ASSERT_THAT(RetryEINTR(recvmsg)(bind_.get(), &msg, 0),
+              SyscallSucceedsWithValue(0));
+
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+  ASSERT_NE(cmsg, nullptr);
+  ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+  ASSERT_EQ(cmsg->cmsg_type, SO_TIMESTAMP);
+  ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct timeval)));
+
+  struct timeval tv = {};
+  memcpy(&tv, CMSG_DATA(cmsg), sizeof(struct timeval));
+
+  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+
+  // There should be nothing to get via ioctl.
+  ASSERT_THAT(ioctl(bind_.get(), SIOCGSTAMP, &tv),
+              SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_P(UdpSocketTest, WriteShutdownNotConnected) {
+  EXPECT_THAT(shutdown(bind_.get(), SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, TimestampIoctl) {
+  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
+  ASSERT_NO_ERRNO(BindLoopback());
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  char buf[3];
+  // Send packet from sock to bind_.
+  ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  struct pollfd pfd = {bind_.get(), POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
+  // There should be no control messages.
+  char recv_buf[sizeof(buf)];
+  ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(bind_.get(), recv_buf, sizeof(recv_buf)));
+
+  // A nonzero timeval should be available via ioctl.
+  struct timeval tv = {};
+  ASSERT_THAT(ioctl(bind_.get(), SIOCGSTAMP, &tv), SyscallSucceeds());
+  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+}
+
+TEST_P(UdpSocketTest, TimestampIoctlNothingRead) {
+  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
+  ASSERT_NO_ERRNO(BindLoopback());
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  struct timeval tv = {};
+  ASSERT_THAT(ioctl(sock_.get(), SIOCGSTAMP, &tv),
+              SyscallFailsWithErrno(ENOENT));
+}
+
+// Test that the timestamp accessed via SIOCGSTAMP is still accessible after
+// SO_TIMESTAMP is enabled and used to retrieve a timestamp.
+TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
+  // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
+  // supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
+  ASSERT_NO_ERRNO(BindLoopback());
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  char buf[3];
+  // Send packet from sock to bind_.
+  ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+  ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, 0),
+              SyscallSucceedsWithValue(0));
+
+  struct pollfd pfd = {bind_.get(), POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
+  // There should be no control messages.
+  char recv_buf[sizeof(buf)];
+  ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(bind_.get(), recv_buf, sizeof(recv_buf)));
+
+  // A nonzero timeval should be available via ioctl.
+  struct timeval tv = {};
+  ASSERT_THAT(ioctl(bind_.get(), SIOCGSTAMP, &tv), SyscallSucceeds());
+  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+
+  // Enable SO_TIMESTAMP and send a message.
+  int v = 1;
+  EXPECT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
+              SyscallSucceeds());
+  ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, 0),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
+  // There should be a message for SO_TIMESTAMP.
+  char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
+  msghdr msg = {};
+  iovec iov = {};
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+  msg.msg_control = cmsgbuf;
+  msg.msg_controllen = sizeof(cmsgbuf);
+  ASSERT_THAT(RetryEINTR(recvmsg)(bind_.get(), &msg, 0),
+              SyscallSucceedsWithValue(0));
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+  ASSERT_NE(cmsg, nullptr);
+
+  // The ioctl should return the exact same values as before.
+  struct timeval tv2 = {};
+  ASSERT_THAT(ioctl(bind_.get(), SIOCGSTAMP, &tv2), SyscallSucceeds());
+  ASSERT_EQ(tv.tv_sec, tv2.tv_sec);
+  ASSERT_EQ(tv.tv_usec, tv2.tv_usec);
+}
+
+// Test that a socket with IP_TOS or IPV6_TCLASS set will set the TOS byte on
+// outgoing packets, and that a receiving socket with IP_RECVTOS or
+// IPV6_RECVTCLASS will create the corresponding control message.
+TEST_P(UdpSocketTest, SetAndReceiveTOS) {
+  ASSERT_NO_ERRNO(BindLoopback());
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  // Allow socket to receive control message.
+  int recv_level = SOL_IP;
+  int recv_type = IP_RECVTOS;
+  if (GetParam() != AddressFamily::kIpv4) {
+    recv_level = SOL_IPV6;
+    recv_type = IPV6_RECVTCLASS;
+  }
+  ASSERT_THAT(setsockopt(bind_.get(), recv_level, recv_type, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Set socket TOS.
+  int sent_level = recv_level;
+  int sent_type = IP_TOS;
+  if (sent_level == SOL_IPV6) {
+    sent_type = IPV6_TCLASS;
+  }
+  int sent_tos = IPTOS_LOWDELAY;  // Choose some TOS value.
+  ASSERT_THAT(setsockopt(sock_.get(), sent_level, sent_type, &sent_tos,
+                         sizeof(sent_tos)),
+              SyscallSucceeds());
+
+  // Prepare message to send.
+  constexpr size_t kDataLength = 1024;
+  struct msghdr sent_msg = {};
+  struct iovec sent_iov = {};
+  char sent_data[kDataLength];
+  sent_iov.iov_base = &sent_data[0];
+  sent_iov.iov_len = kDataLength;
+  sent_msg.msg_iov = &sent_iov;
+  sent_msg.msg_iovlen = 1;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(sock_.get(), &sent_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  // Receive message.
+  struct msghdr received_msg = {};
+  struct iovec received_iov = {};
+  char received_data[kDataLength];
+  received_iov.iov_base = &received_data[0];
+  received_iov.iov_len = kDataLength;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+  size_t cmsg_data_len = sizeof(int8_t);
+  if (sent_type == IPV6_TCLASS) {
+    cmsg_data_len = sizeof(int);
+  }
+  std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+  received_msg.msg_control = &received_cmsgbuf[0];
+  received_msg.msg_controllen = received_cmsgbuf.size();
+  ASSERT_THAT(RetryEINTR(recvmsg)(bind_.get(), &received_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+  ASSERT_NE(cmsg, nullptr);
+  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+  EXPECT_EQ(cmsg->cmsg_level, sent_level);
+  EXPECT_EQ(cmsg->cmsg_type, sent_type);
+  int8_t received_tos = 0;
+  memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
+  EXPECT_EQ(received_tos, sent_tos);
+}
+
+// Test that sendmsg with IP_TOS and IPV6_TCLASS control messages will set the
+// TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
+// IPV6_RECVTCLASS will create the corresponding control message.
+TEST_P(UdpSocketTest, SendAndReceiveTOS) {
+  // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
+  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+
+  ASSERT_NO_ERRNO(BindLoopback());
+  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+  // Allow socket to receive control message.
+  int recv_level = SOL_IP;
+  int recv_type = IP_RECVTOS;
+  if (GetParam() != AddressFamily::kIpv4) {
+    recv_level = SOL_IPV6;
+    recv_type = IPV6_RECVTCLASS;
+  }
+  int recv_opt = kSockOptOn;
+  ASSERT_THAT(setsockopt(bind_.get(), recv_level, recv_type, &recv_opt,
+                         sizeof(recv_opt)),
+              SyscallSucceeds());
+
+  // Prepare message to send.
+  constexpr size_t kDataLength = 1024;
+  int sent_level = recv_level;
+  int sent_type = IP_TOS;
+  int sent_tos = IPTOS_LOWDELAY;  // Choose some TOS value.
+
+  struct msghdr sent_msg = {};
+  struct iovec sent_iov = {};
+  char sent_data[kDataLength];
+  sent_iov.iov_base = &sent_data[0];
+  sent_iov.iov_len = kDataLength;
+  sent_msg.msg_iov = &sent_iov;
+  sent_msg.msg_iovlen = 1;
+  size_t cmsg_data_len = sizeof(int8_t);
+  if (sent_level == SOL_IPV6) {
+    sent_type = IPV6_TCLASS;
+    cmsg_data_len = sizeof(int);
+  }
+  std::vector<char> sent_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+  sent_msg.msg_control = &sent_cmsgbuf[0];
+  sent_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+
+  // Manually add control message.
+  struct cmsghdr* sent_cmsg = CMSG_FIRSTHDR(&sent_msg);
+  sent_cmsg->cmsg_len = CMSG_LEN(cmsg_data_len);
+  sent_cmsg->cmsg_level = sent_level;
+  sent_cmsg->cmsg_type = sent_type;
+  *(int8_t*)CMSG_DATA(sent_cmsg) = sent_tos;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(sock_.get(), &sent_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  // Receive message.
+  struct msghdr received_msg = {};
+  struct iovec received_iov = {};
+  char received_data[kDataLength];
+  received_iov.iov_base = &received_data[0];
+  received_iov.iov_len = kDataLength;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+  std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+  received_msg.msg_control = &received_cmsgbuf[0];
+  received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+  ASSERT_THAT(RetryEINTR(recvmsg)(bind_.get(), &received_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+  ASSERT_NE(cmsg, nullptr);
+  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+  EXPECT_EQ(cmsg->cmsg_level, sent_level);
+  EXPECT_EQ(cmsg->cmsg_type, sent_type);
+  int8_t received_tos = 0;
+  memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
+  EXPECT_EQ(received_tos, sent_tos);
+}
+
+TEST_P(UdpSocketTest, RecvBufLimitsEmptyRcvBuf) {
+  // Discover minimum buffer size by setting it to zero.
+  constexpr int kRcvBufSz = 0;
+  ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &kRcvBufSz,
+                         sizeof(kRcvBufSz)),
+              SyscallSucceeds());
+
+  int min = 0;
+  socklen_t min_len = sizeof(min);
+  ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+              SyscallSucceeds());
+
+  // Bind bind_ to loopback.
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  {
+    // Send data of size min and verify that it's received.
+    std::vector<char> buf(min);
+    RandomizeBuffer(buf.data(), buf.size());
+    ASSERT_THAT(
+        sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+        SyscallSucceedsWithValue(buf.size()));
+    std::vector<char> received(buf.size());
+    EXPECT_THAT(RecvTimeout(bind_.get(), received.data(), received.size(),
+                            1 /*timeout*/),
+                IsPosixErrorOkAndHolds(received.size()));
+  }
+
+  {
+    // Send data of size min + 1 and verify that its received. Both linux and
+    // Netstack accept a dgram that exceeds rcvBuf limits if the receive buffer
+    // is currently empty.
+    std::vector<char> buf(min + 1);
+    RandomizeBuffer(buf.data(), buf.size());
+    ASSERT_THAT(
+        sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+        SyscallSucceedsWithValue(buf.size()));
+
+    std::vector<char> received(buf.size());
+    ASSERT_THAT(RecvTimeout(bind_.get(), received.data(), received.size(),
+                            1 /*timeout*/),
+                IsPosixErrorOkAndHolds(received.size()));
+  }
+}
+
+// Test that receive buffer limits are enforced.
+TEST_P(UdpSocketTest, RecvBufLimits) {
+  // Bind s_ to loopback.
+  ASSERT_NO_ERRNO(BindLoopback());
+
+  int min = 0;
+  {
+    // Discover minimum buffer size by trying to set it to zero.
+    constexpr int kRcvBufSz = 0;
+    ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &kRcvBufSz,
+                           sizeof(kRcvBufSz)),
+                SyscallSucceeds());
+
+    socklen_t min_len = sizeof(min);
+    ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+                SyscallSucceeds());
+  }
+
+  // Now set the limit to min * 4.
+  int new_rcv_buf_sz = min * 4;
+  if (!IsRunningOnGvisor() || IsRunningWithHostinet()) {
+    // Linux doubles the value specified so just set to min * 2.
+    new_rcv_buf_sz = min * 2;
+  }
+
+  ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &new_rcv_buf_sz,
+                         sizeof(new_rcv_buf_sz)),
+              SyscallSucceeds());
+  int rcv_buf_sz = 0;
+  {
+    socklen_t rcv_buf_len = sizeof(rcv_buf_sz);
+    ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &rcv_buf_sz,
+                           &rcv_buf_len),
+                SyscallSucceeds());
+  }
+
+  {
+    std::vector<char> buf(min);
+    RandomizeBuffer(buf.data(), buf.size());
+
+    ASSERT_THAT(
+        sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+        SyscallSucceedsWithValue(buf.size()));
+    ASSERT_THAT(
+        sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+        SyscallSucceedsWithValue(buf.size()));
+    ASSERT_THAT(
+        sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+        SyscallSucceedsWithValue(buf.size()));
+    ASSERT_THAT(
+        sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+        SyscallSucceedsWithValue(buf.size()));
+    int sent = 4;
+    if (IsRunningOnGvisor() && !IsRunningWithHostinet()) {
+      // Linux seems to drop the 4th packet even though technically it should
+      // fit in the receive buffer.
+      ASSERT_THAT(
+          sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+          SyscallSucceedsWithValue(buf.size()));
+      sent++;
+    }
+
+    for (int i = 0; i < sent - 1; i++) {
+      // Receive the data.
+      std::vector<char> received(buf.size());
+      EXPECT_THAT(RecvTimeout(bind_.get(), received.data(), received.size(),
+                              1 /*timeout*/),
+                  IsPosixErrorOkAndHolds(received.size()));
+      EXPECT_EQ(memcmp(buf.data(), received.data(), buf.size()), 0);
+    }
+
+    // The last receive should fail with EAGAIN as the last packet should have
+    // been dropped due to lack of space in the receive buffer.
+    std::vector<char> received(buf.size());
+    EXPECT_THAT(
+        recv(bind_.get(), received.data(), received.size(), MSG_DONTWAIT),
+        SyscallFailsWithErrno(EAGAIN));
+  }
+}
+
+#ifdef __linux__
+
+// TODO(gvisor.dev/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
+// gVisor currently silently ignores attaching a filter.
+TEST_P(UdpSocketTest, SetSocketDetachFilter) {
+  // Program generated using sudo tcpdump -i lo udp and port 1234 -dd
+  struct sock_filter code[] = {
+      {0x28, 0, 0, 0x0000000c},  {0x15, 0, 6, 0x000086dd},
+      {0x30, 0, 0, 0x00000014},  {0x15, 0, 15, 0x00000011},
+      {0x28, 0, 0, 0x00000036},  {0x15, 12, 0, 0x000004d2},
+      {0x28, 0, 0, 0x00000038},  {0x15, 10, 11, 0x000004d2},
+      {0x15, 0, 10, 0x00000800}, {0x30, 0, 0, 0x00000017},
+      {0x15, 0, 8, 0x00000011},  {0x28, 0, 0, 0x00000014},
+      {0x45, 6, 0, 0x00001fff},  {0xb1, 0, 0, 0x0000000e},
+      {0x48, 0, 0, 0x0000000e},  {0x15, 2, 0, 0x000004d2},
+      {0x48, 0, 0, 0x00000010},  {0x15, 0, 1, 0x000004d2},
+      {0x6, 0, 0, 0x00040000},   {0x6, 0, 0, 0x00000000},
+  };
+  struct sock_fprog bpf = {
+      .len = ABSL_ARRAYSIZE(code),
+      .filter = code,
+  };
+  ASSERT_THAT(
+      setsockopt(sock_.get(), SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)),
+      SyscallSucceeds());
+
+  constexpr int val = 0;
+  ASSERT_THAT(
+      setsockopt(sock_.get(), SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)),
+      SyscallSucceeds());
+}
+
+#endif  // __linux__
+
+TEST_P(UdpSocketTest, SetSocketDetachFilterNoInstalledFilter) {
+  // TODO(gvisor.dev/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
+  SKIP_IF(IsRunningOnGvisor());
+  constexpr int val = 0;
+  ASSERT_THAT(
+      setsockopt(sock_.get(), SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)),
+      SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_P(UdpSocketTest, GetSocketDetachFilter) {
+  int val = 0;
+  socklen_t val_len = sizeof(val);
+  ASSERT_THAT(
+      getsockopt(sock_.get(), SOL_SOCKET, SO_DETACH_FILTER, &val, &val_len),
+      SyscallFailsWithErrno(ENOPROTOOPT));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, UdpSocketTest,
                          ::testing::Values(AddressFamily::kIpv4,
                                            AddressFamily::kIpv6,
diff --git a/test/syscalls/linux/udp_socket_errqueue_test_case.cc b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
deleted file mode 100644
index 54a0594f7..000000000
--- a/test/syscalls/linux/udp_socket_errqueue_test_case.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef __fuchsia__
-
-#include <arpa/inet.h>
-#include <fcntl.h>
-#include <linux/errqueue.h>
-#include <netinet/in.h>
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-
-#include "gtest/gtest.h"
-#include "absl/base/macros.h"
-#include "absl/time/clock.h"
-#include "absl/time/time.h"
-#include "test/syscalls/linux/socket_test_util.h"
-#include "test/syscalls/linux/udp_socket_test_cases.h"
-#include "test/syscalls/linux/unix_domain_socket_test_util.h"
-#include "test/util/test_util.h"
-#include "test/util/thread_util.h"
-
-namespace gvisor {
-namespace testing {
-
-TEST_P(UdpSocketTest, ErrorQueue) {
-  char cmsgbuf[CMSG_SPACE(sizeof(sock_extended_err))];
-  msghdr msg;
-  memset(&msg, 0, sizeof(msg));
-  iovec iov;
-  memset(&iov, 0, sizeof(iov));
-  msg.msg_iov = &iov;
-  msg.msg_iovlen = 1;
-  msg.msg_control = cmsgbuf;
-  msg.msg_controllen = sizeof(cmsgbuf);
-
-  // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT.
-  EXPECT_THAT(RetryEINTR(recvmsg)(bind_.get(), &msg, MSG_ERRQUEUE),
-              SyscallFailsWithErrno(EAGAIN));
-}
-
-}  // namespace testing
-}  // namespace gvisor
-
-#endif  // __fuchsia__
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
deleted file mode 100644
index 60c48ed6e..000000000
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ /dev/null
@@ -1,1781 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "test/syscalls/linux/udp_socket_test_cases.h"
-
-#include <arpa/inet.h>
-#include <fcntl.h>
-#ifndef __fuchsia__
-#include <linux/filter.h>
-#endif  // __fuchsia__
-#include <netinet/in.h>
-#include <poll.h>
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-
-#include "absl/strings/str_format.h"
-#ifndef SIOCGSTAMP
-#include <linux/sockios.h>
-#endif
-
-#include "gtest/gtest.h"
-#include "absl/base/macros.h"
-#include "absl/time/clock.h"
-#include "absl/time/time.h"
-#include "test/syscalls/linux/ip_socket_test_util.h"
-#include "test/syscalls/linux/socket_test_util.h"
-#include "test/syscalls/linux/unix_domain_socket_test_util.h"
-#include "test/util/file_descriptor.h"
-#include "test/util/posix_error.h"
-#include "test/util/test_util.h"
-#include "test/util/thread_util.h"
-
-namespace gvisor {
-namespace testing {
-
-// Gets a pointer to the port component of the given address.
-uint16_t* Port(struct sockaddr_storage* addr) {
-  switch (addr->ss_family) {
-    case AF_INET: {
-      auto sin = reinterpret_cast<struct sockaddr_in*>(addr);
-      return &sin->sin_port;
-    }
-    case AF_INET6: {
-      auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr);
-      return &sin6->sin6_port;
-    }
-  }
-
-  return nullptr;
-}
-
-// Sets addr port to "port".
-void SetPort(struct sockaddr_storage* addr, uint16_t port) {
-  switch (addr->ss_family) {
-    case AF_INET: {
-      auto sin = reinterpret_cast<struct sockaddr_in*>(addr);
-      sin->sin_port = port;
-      break;
-    }
-    case AF_INET6: {
-      auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr);
-      sin6->sin6_port = port;
-      break;
-    }
-  }
-}
-
-void UdpSocketTest::SetUp() {
-  addrlen_ = GetAddrLength();
-
-  bind_ =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP));
-  memset(&bind_addr_storage_, 0, sizeof(bind_addr_storage_));
-  bind_addr_ = reinterpret_cast<struct sockaddr*>(&bind_addr_storage_);
-
-  sock_ =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP));
-}
-
-int UdpSocketTest::GetFamily() {
-  if (GetParam() == AddressFamily::kIpv4) {
-    return AF_INET;
-  }
-  return AF_INET6;
-}
-
-PosixError UdpSocketTest::BindLoopback() {
-  bind_addr_storage_ = InetLoopbackAddr();
-  struct sockaddr* bind_addr_ =
-      reinterpret_cast<struct sockaddr*>(&bind_addr_storage_);
-  return BindSocket(bind_.get(), bind_addr_);
-}
-
-PosixError UdpSocketTest::BindAny() {
-  bind_addr_storage_ = InetAnyAddr();
-  struct sockaddr* bind_addr_ =
-      reinterpret_cast<struct sockaddr*>(&bind_addr_storage_);
-  return BindSocket(bind_.get(), bind_addr_);
-}
-
-PosixError UdpSocketTest::BindSocket(int socket, struct sockaddr* addr) {
-  socklen_t len = sizeof(bind_addr_storage_);
-
-  // Bind, then check that we get the right address.
-  RETURN_ERROR_IF_SYSCALL_FAIL(bind(socket, addr, addrlen_));
-
-  RETURN_ERROR_IF_SYSCALL_FAIL(getsockname(socket, addr, &len));
-
-  if (addrlen_ != len) {
-    return PosixError(
-        EINVAL,
-        absl::StrFormat("getsockname len: %u expected: %u", len, addrlen_));
-  }
-  return PosixError(0);
-}
-
-socklen_t UdpSocketTest::GetAddrLength() {
-  struct sockaddr_storage addr;
-  if (GetFamily() == AF_INET) {
-    auto sin = reinterpret_cast<struct sockaddr_in*>(&addr);
-    return sizeof(*sin);
-  }
-
-  auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&addr);
-  return sizeof(*sin6);
-}
-
-sockaddr_storage UdpSocketTest::InetAnyAddr() {
-  struct sockaddr_storage addr;
-  memset(&addr, 0, sizeof(addr));
-  reinterpret_cast<struct sockaddr*>(&addr)->sa_family = GetFamily();
-
-  if (GetFamily() == AF_INET) {
-    auto sin = reinterpret_cast<struct sockaddr_in*>(&addr);
-    sin->sin_addr.s_addr = htonl(INADDR_ANY);
-    sin->sin_port = htons(0);
-    return addr;
-  }
-
-  auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&addr);
-  sin6->sin6_addr = IN6ADDR_ANY_INIT;
-  sin6->sin6_port = htons(0);
-  return addr;
-}
-
-sockaddr_storage UdpSocketTest::InetLoopbackAddr() {
-  struct sockaddr_storage addr;
-  memset(&addr, 0, sizeof(addr));
-  reinterpret_cast<struct sockaddr*>(&addr)->sa_family = GetFamily();
-
-  if (GetFamily() == AF_INET) {
-    auto sin = reinterpret_cast<struct sockaddr_in*>(&addr);
-    sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-    sin->sin_port = htons(0);
-    return addr;
-  }
-  auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&addr);
-  sin6->sin6_addr = in6addr_loopback;
-  sin6->sin6_port = htons(0);
-  return addr;
-}
-
-void UdpSocketTest::Disconnect(int sockfd) {
-  sockaddr_storage addr_storage = InetAnyAddr();
-  sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
-  socklen_t addrlen = sizeof(addr_storage);
-
-  addr->sa_family = AF_UNSPEC;
-  ASSERT_THAT(connect(sockfd, addr, addrlen), SyscallSucceeds());
-
-  // Check that after disconnect the socket is bound to the ANY address.
-  EXPECT_THAT(getsockname(sockfd, addr, &addrlen), SyscallSucceeds());
-  if (GetParam() == AddressFamily::kIpv4) {
-    auto addr_out = reinterpret_cast<struct sockaddr_in*>(addr);
-    EXPECT_EQ(addrlen, sizeof(*addr_out));
-    EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
-  } else {
-    auto addr_out = reinterpret_cast<struct sockaddr_in6*>(addr);
-    EXPECT_EQ(addrlen, sizeof(*addr_out));
-    struct in6_addr loopback = IN6ADDR_ANY_INIT;
-
-    EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
-  }
-}
-
-TEST_P(UdpSocketTest, Creation) {
-  FileDescriptor sock =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP));
-  EXPECT_THAT(close(sock.release()), SyscallSucceeds());
-
-  sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, 0));
-  EXPECT_THAT(close(sock.release()), SyscallSucceeds());
-
-  ASSERT_THAT(socket(GetFamily(), SOCK_STREAM, IPPROTO_UDP), SyscallFails());
-}
-
-TEST_P(UdpSocketTest, Getsockname) {
-  // Check that we're not bound.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(
-      getsockname(bind_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-      SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  struct sockaddr_storage any = InetAnyAddr();
-  EXPECT_EQ(memcmp(&addr, reinterpret_cast<struct sockaddr*>(&any), addrlen_),
-            0);
-
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  EXPECT_THAT(
-      getsockname(bind_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-      SyscallSucceeds());
-
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, bind_addr_, addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, Getpeername) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Check that we're not connected.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(
-      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-      SyscallFailsWithErrno(ENOTCONN));
-
-  // Connect, then check that we get the right address.
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  addrlen = sizeof(addr);
-  EXPECT_THAT(
-      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-      SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, bind_addr_, addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, SendNotConnected) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Do send & write, they must fail.
-  char buf[512];
-  EXPECT_THAT(send(sock_.get(), buf, sizeof(buf), 0),
-              SyscallFailsWithErrno(EDESTADDRREQ));
-
-  EXPECT_THAT(write(sock_.get(), buf, sizeof(buf)),
-              SyscallFailsWithErrno(EDESTADDRREQ));
-
-  // Use sendto.
-  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Check that we're bound now.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(
-      getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-      SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_NE(*Port(&addr), 0);
-}
-
-TEST_P(UdpSocketTest, ConnectBinds) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Connect the socket.
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  // Check that we're bound now.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(
-      getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-      SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_NE(*Port(&addr), 0);
-}
-
-TEST_P(UdpSocketTest, ReceiveNotBound) {
-  char buf[512];
-  EXPECT_THAT(recv(sock_.get(), buf, sizeof(buf), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, Bind) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Try to bind again.
-  EXPECT_THAT(bind(bind_.get(), bind_addr_, addrlen_),
-              SyscallFailsWithErrno(EINVAL));
-
-  // Check that we're still bound to the original address.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(
-      getsockname(bind_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-      SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, bind_addr_, addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, BindInUse) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Try to bind again.
-  EXPECT_THAT(bind(sock_.get(), bind_addr_, addrlen_),
-              SyscallFailsWithErrno(EADDRINUSE));
-}
-
-TEST_P(UdpSocketTest, ReceiveAfterConnect) {
-  ASSERT_NO_ERRNO(BindLoopback());
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  // Send from sock_ to bind_
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Receive the data.
-  char received[sizeof(buf)];
-  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), 0),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, ReceiveAfterDisconnect) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  for (int i = 0; i < 2; i++) {
-    // Connet sock_ to bound address.
-    ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-    struct sockaddr_storage addr;
-    socklen_t addrlen = sizeof(addr);
-    EXPECT_THAT(
-        getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-        SyscallSucceeds());
-    EXPECT_EQ(addrlen, addrlen_);
-
-    // Send from sock to bind_.
-    char buf[512];
-    RandomizeBuffer(buf, sizeof(buf));
-
-    ASSERT_THAT(sendto(bind_.get(), buf, sizeof(buf), 0,
-                       reinterpret_cast<sockaddr*>(&addr), addrlen),
-                SyscallSucceedsWithValue(sizeof(buf)));
-
-    // Receive the data.
-    char received[sizeof(buf)];
-    EXPECT_THAT(recv(sock_.get(), received, sizeof(received), 0),
-                SyscallSucceedsWithValue(sizeof(received)));
-    EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-
-    // Disconnect sock_.
-    struct sockaddr unspec = {};
-    unspec.sa_family = AF_UNSPEC;
-    ASSERT_THAT(connect(sock_.get(), &unspec, sizeof(unspec.sa_family)),
-                SyscallSucceeds());
-  }
-}
-
-TEST_P(UdpSocketTest, Connect) {
-  ASSERT_NO_ERRNO(BindLoopback());
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  // Check that we're connected to the right peer.
-  struct sockaddr_storage peer;
-  socklen_t peerlen = sizeof(peer);
-  EXPECT_THAT(
-      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&peer), &peerlen),
-      SyscallSucceeds());
-  EXPECT_EQ(peerlen, addrlen_);
-  EXPECT_EQ(memcmp(&peer, bind_addr_, addrlen_), 0);
-
-  // Try to bind after connect.
-  struct sockaddr_storage any = InetAnyAddr();
-  EXPECT_THAT(
-      bind(sock_.get(), reinterpret_cast<struct sockaddr*>(&any), addrlen_),
-      SyscallFailsWithErrno(EINVAL));
-
-  struct sockaddr_storage bind2_storage = InetLoopbackAddr();
-  struct sockaddr* bind2_addr =
-      reinterpret_cast<struct sockaddr*>(&bind2_storage);
-  FileDescriptor bind2 =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP));
-  ASSERT_NO_ERRNO(BindSocket(bind2.get(), bind2_addr));
-
-  // Try to connect again.
-  EXPECT_THAT(connect(sock_.get(), bind2_addr, addrlen_), SyscallSucceeds());
-
-  // Check that peer name changed.
-  peerlen = sizeof(peer);
-  EXPECT_THAT(
-      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&peer), &peerlen),
-      SyscallSucceeds());
-  EXPECT_EQ(peerlen, addrlen_);
-  EXPECT_EQ(memcmp(&peer, bind2_addr, addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, ConnectAnyZero) {
-  // TODO(138658473): Enable when we can connect to port 0 with gVisor.
-  SKIP_IF(IsRunningOnGvisor());
-
-  struct sockaddr_storage any = InetAnyAddr();
-  EXPECT_THAT(
-      connect(sock_.get(), reinterpret_cast<struct sockaddr*>(&any), addrlen_),
-      SyscallSucceeds());
-
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(
-      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-      SyscallFailsWithErrno(ENOTCONN));
-}
-
-TEST_P(UdpSocketTest, ConnectAnyWithPort) {
-  ASSERT_NO_ERRNO(BindAny());
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(
-      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-      SyscallSucceeds());
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterConnectAny) {
-  // TODO(138658473): Enable when we can connect to port 0 with gVisor.
-  SKIP_IF(IsRunningOnGvisor());
-  struct sockaddr_storage any = InetAnyAddr();
-  EXPECT_THAT(
-      connect(sock_.get(), reinterpret_cast<struct sockaddr*>(&any), addrlen_),
-      SyscallSucceeds());
-
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(
-      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-      SyscallFailsWithErrno(ENOTCONN));
-
-  Disconnect(sock_.get());
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterConnectAnyWithPort) {
-  ASSERT_NO_ERRNO(BindAny());
-  EXPECT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(
-      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-      SyscallSucceeds());
-
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(*Port(&bind_addr_storage_), *Port(&addr));
-
-  Disconnect(sock_.get());
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterBind) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Bind to the next port above bind_.
-  struct sockaddr_storage addr_storage = InetLoopbackAddr();
-  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
-  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
-  ASSERT_NO_ERRNO(BindSocket(sock_.get(), addr));
-
-  // Connect the socket.
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  struct sockaddr_storage unspec = {};
-  unspec.ss_family = AF_UNSPEC;
-  EXPECT_THAT(connect(sock_.get(), reinterpret_cast<sockaddr*>(&unspec),
-                      sizeof(unspec.ss_family)),
-              SyscallSucceeds());
-
-  // Check that we're still bound.
-  socklen_t addrlen = sizeof(unspec);
-  EXPECT_THAT(
-      getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&unspec), &addrlen),
-      SyscallSucceeds());
-
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(addr, &unspec, addrlen_), 0);
-
-  addrlen = sizeof(addr);
-  EXPECT_THAT(getpeername(sock_.get(), addr, &addrlen),
-              SyscallFailsWithErrno(ENOTCONN));
-}
-
-TEST_P(UdpSocketTest, BindToAnyConnnectToLocalhost) {
-  ASSERT_NO_ERRNO(BindAny());
-
-  struct sockaddr_storage addr_storage = InetLoopbackAddr();
-  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
-  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
-  socklen_t addrlen = sizeof(addr);
-
-  // Connect the socket.
-  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
-
-  EXPECT_THAT(getsockname(bind_.get(), addr, &addrlen), SyscallSucceeds());
-
-  // If the socket is bound to ANY and connected to a loopback address,
-  // getsockname() has to return the loopback address.
-  if (GetParam() == AddressFamily::kIpv4) {
-    auto addr_out = reinterpret_cast<struct sockaddr_in*>(addr);
-    EXPECT_EQ(addrlen, sizeof(*addr_out));
-    EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK));
-  } else {
-    auto addr_out = reinterpret_cast<struct sockaddr_in6*>(addr);
-    struct in6_addr loopback = IN6ADDR_LOOPBACK_INIT;
-    EXPECT_EQ(addrlen, sizeof(*addr_out));
-    EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
-  }
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterBindToAny) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  struct sockaddr_storage any_storage = InetAnyAddr();
-  struct sockaddr* any = reinterpret_cast<struct sockaddr*>(&any_storage);
-  SetPort(&any_storage, *Port(&bind_addr_storage_) + 1);
-
-  ASSERT_NO_ERRNO(BindSocket(sock_.get(), any));
-
-  // Connect the socket.
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  Disconnect(sock_.get());
-
-  // Check that we're still bound.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(
-      getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-      SyscallSucceeds());
-
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, any, addrlen), 0);
-
-  addrlen = sizeof(addr);
-  EXPECT_THAT(
-      getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-      SyscallFailsWithErrno(ENOTCONN));
-}
-
-TEST_P(UdpSocketTest, Disconnect) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  struct sockaddr_storage any_storage = InetAnyAddr();
-  struct sockaddr* any = reinterpret_cast<struct sockaddr*>(&any_storage);
-  SetPort(&any_storage, *Port(&bind_addr_storage_) + 1);
-  ASSERT_NO_ERRNO(BindSocket(sock_.get(), any));
-
-  for (int i = 0; i < 2; i++) {
-    // Try to connect again.
-    EXPECT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-    // Check that we're connected to the right peer.
-    struct sockaddr_storage peer;
-    socklen_t peerlen = sizeof(peer);
-    EXPECT_THAT(
-        getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&peer), &peerlen),
-        SyscallSucceeds());
-    EXPECT_EQ(peerlen, addrlen_);
-    EXPECT_EQ(memcmp(&peer, bind_addr_, addrlen_), 0);
-
-    // Try to disconnect.
-    struct sockaddr_storage addr = {};
-    addr.ss_family = AF_UNSPEC;
-    EXPECT_THAT(connect(sock_.get(), reinterpret_cast<sockaddr*>(&addr),
-                        sizeof(addr.ss_family)),
-                SyscallSucceeds());
-
-    peerlen = sizeof(peer);
-    EXPECT_THAT(
-        getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&peer), &peerlen),
-        SyscallFailsWithErrno(ENOTCONN));
-
-    // Check that we're still bound.
-    socklen_t addrlen = sizeof(addr);
-    EXPECT_THAT(
-        getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
-        SyscallSucceeds());
-    EXPECT_EQ(addrlen, addrlen_);
-    EXPECT_EQ(*Port(&addr), *Port(&any_storage));
-  }
-}
-
-TEST_P(UdpSocketTest, ConnectBadAddress) {
-  struct sockaddr addr = {};
-  addr.sa_family = GetFamily();
-  ASSERT_THAT(connect(sock_.get(), &addr, sizeof(addr.sa_family)),
-              SyscallFailsWithErrno(EINVAL));
-}
-
-TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  struct sockaddr_storage addr_storage = InetAnyAddr();
-  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
-  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
-
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  // Send to a different destination than we're connected to.
-  char buf[512];
-  EXPECT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, addr, addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-}
-
-TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
-  // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
-  SKIP_IF(IsRunningWithHostinet());
-
-  ASSERT_NO_ERRNO(BindLoopback());
-  // Connect to loopback:bind_addr_+1.
-  struct sockaddr_storage addr_storage = InetLoopbackAddr();
-  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
-  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
-  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
-
-  // Bind sock to loopback:bind_addr_+1.
-  ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
-
-  char buf[3];
-  // Send zero length packet from bind_ to sock_.
-  ASSERT_THAT(write(bind_.get(), buf, 0), SyscallSucceedsWithValue(0));
-
-  struct pollfd pfd = {sock_.get(), POLLIN, 0};
-  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout*/ 1000),
-              SyscallSucceedsWithValue(1));
-
-  // Receive the packet.
-  char received[3];
-  EXPECT_THAT(read(sock_.get(), received, sizeof(received)),
-              SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
-  // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
-  SKIP_IF(IsRunningWithHostinet());
-
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Connect to loopback:bind_addr_port+1.
-  struct sockaddr_storage addr_storage = InetLoopbackAddr();
-  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
-  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
-  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
-
-  // Bind sock to loopback:bind_addr_port+1.
-  ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
-
-  // Set sock to non-blocking.
-  int opts = 0;
-  ASSERT_THAT(opts = fcntl(sock_.get(), F_GETFL), SyscallSucceeds());
-  ASSERT_THAT(fcntl(sock_.get(), F_SETFL, opts | O_NONBLOCK),
-              SyscallSucceeds());
-
-  char buf[3];
-  // Send zero length packet from bind_ to sock_.
-  ASSERT_THAT(write(bind_.get(), buf, 0), SyscallSucceedsWithValue(0));
-
-  struct pollfd pfd = {sock_.get(), POLLIN, 0};
-  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
-              SyscallSucceedsWithValue(1));
-
-  // Receive the packet.
-  char received[3];
-  EXPECT_THAT(read(sock_.get(), received, sizeof(received)),
-              SyscallSucceedsWithValue(0));
-  EXPECT_THAT(read(sock_.get(), received, sizeof(received)),
-              SyscallFailsWithErrno(EAGAIN));
-}
-
-TEST_P(UdpSocketTest, SendAndReceiveNotConnected) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Send some data to bind_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Receive the data.
-  char received[sizeof(buf)];
-  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), 0),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, SendAndReceiveConnected) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Connect to loopback:bind_addr_port+1.
-  struct sockaddr_storage addr_storage = InetLoopbackAddr();
-  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
-  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
-  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
-
-  // Bind sock to loopback:TestPort+1.
-  ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
-
-  // Send some data from sock to bind_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Receive the data.
-  char received[sizeof(buf)];
-  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), 0),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, ReceiveFromNotConnected) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Connect to loopback:bind_addr_port+1.
-  struct sockaddr_storage addr_storage = InetLoopbackAddr();
-  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
-  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
-  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
-
-  // Bind sock to loopback:bind_addr_port+2.
-  struct sockaddr_storage addr2_storage = InetLoopbackAddr();
-  struct sockaddr* addr2 = reinterpret_cast<struct sockaddr*>(&addr2_storage);
-  SetPort(&addr2_storage, *Port(&bind_addr_storage_) + 2);
-  ASSERT_THAT(bind(sock_.get(), addr2, addrlen_), SyscallSucceeds());
-
-  // Send some data from sock to bind_.
-  char buf[512];
-  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Check that the data isn't received because it was sent from a different
-  // address than we're connected.
-  EXPECT_THAT(recv(sock_.get(), buf, sizeof(buf), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, ReceiveBeforeConnect) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Bind sock to loopback:bind_addr_port+2.
-  struct sockaddr_storage addr2_storage = InetLoopbackAddr();
-  struct sockaddr* addr2 = reinterpret_cast<struct sockaddr*>(&addr2_storage);
-  SetPort(&addr2_storage, *Port(&bind_addr_storage_) + 2);
-  ASSERT_THAT(bind(sock_.get(), addr2, addrlen_), SyscallSucceeds());
-
-  // Send some data from sock to bind_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Connect to loopback:TestPort+1.
-  struct sockaddr_storage addr_storage = InetLoopbackAddr();
-  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
-  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
-  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
-
-  // Receive the data. It works because it was sent before the connect.
-  char received[sizeof(buf)];
-  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), 0),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-
-  // Send again. This time it should not be received.
-  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  EXPECT_THAT(recv(bind_.get(), buf, sizeof(buf), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, ReceiveFrom) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Connect to loopback:bind_addr_port+1.
-  struct sockaddr_storage addr_storage = InetLoopbackAddr();
-  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
-  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
-  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
-
-  // Bind sock to loopback:TestPort+1.
-  ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
-
-  // Send some data from sock to bind_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Receive the data and sender address.
-  char received[sizeof(buf)];
-  struct sockaddr_storage addr2;
-  socklen_t addr2len = sizeof(addr2);
-  EXPECT_THAT(recvfrom(bind_.get(), received, sizeof(received), 0,
-                       reinterpret_cast<sockaddr*>(&addr2), &addr2len),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-  EXPECT_EQ(addr2len, addrlen_);
-  EXPECT_EQ(memcmp(addr, &addr2, addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, Listen) {
-  ASSERT_THAT(listen(sock_.get(), SOMAXCONN),
-              SyscallFailsWithErrno(EOPNOTSUPP));
-}
-
-TEST_P(UdpSocketTest, Accept) {
-  ASSERT_THAT(accept(sock_.get(), nullptr, nullptr),
-              SyscallFailsWithErrno(EOPNOTSUPP));
-}
-
-// This test validates that a read shutdown with pending data allows the read
-// to proceed with the data before returning EAGAIN.
-TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Connect to loopback:bind_addr_port+1.
-  struct sockaddr_storage addr_storage = InetLoopbackAddr();
-  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
-  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
-  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
-
-  // Bind to loopback:bind_addr_port+1 and connect to bind_addr_.
-  ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  // Verify that we get EWOULDBLOCK when there is nothing to read.
-  char received[512];
-  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  const char* buf = "abc";
-  EXPECT_THAT(write(sock_.get(), buf, 3), SyscallSucceedsWithValue(3));
-
-  int opts = 0;
-  ASSERT_THAT(opts = fcntl(bind_.get(), F_GETFL), SyscallSucceeds());
-  ASSERT_THAT(fcntl(bind_.get(), F_SETFL, opts | O_NONBLOCK),
-              SyscallSucceeds());
-  ASSERT_THAT(opts = fcntl(bind_.get(), F_GETFL), SyscallSucceeds());
-  ASSERT_NE(opts & O_NONBLOCK, 0);
-
-  EXPECT_THAT(shutdown(bind_.get(), SHUT_RD), SyscallSucceeds());
-
-  struct pollfd pfd = {bind_.get(), POLLIN, 0};
-  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
-              SyscallSucceedsWithValue(1));
-
-  // We should get the data even though read has been shutdown.
-  EXPECT_THAT(recv(bind_.get(), received, 2, 0), SyscallSucceedsWithValue(2));
-
-  // Because we read less than the entire packet length, since it's a packet
-  // based socket any subsequent reads should return EWOULDBLOCK.
-  EXPECT_THAT(recv(bind_.get(), received, 1, 0),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-// This test is validating that even after a socket is shutdown if it's
-// reconnected it will reset the shutdown state.
-TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) {
-  char received[512];
-  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  EXPECT_THAT(shutdown(bind_.get(), SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
-
-  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  // Connect the socket, then try to shutdown again.
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Connect to loopback:bind_addr_port+1.
-  struct sockaddr_storage addr_storage = InetLoopbackAddr();
-  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
-  SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
-  ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
-
-  EXPECT_THAT(recv(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, ReadShutdown) {
-  // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
-  // MSG_DONTWAIT blocks indefinitely.
-  SKIP_IF(IsRunningWithHostinet());
-
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  char received[512];
-  EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  EXPECT_THAT(shutdown(sock_.get(), SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
-
-  EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  // Connect the socket, then try to shutdown again.
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  EXPECT_THAT(shutdown(sock_.get(), SHUT_RD), SyscallSucceeds());
-
-  EXPECT_THAT(recv(sock_.get(), received, sizeof(received), 0),
-              SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UdpSocketTest, ReadShutdownDifferentThread) {
-  // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
-  // MSG_DONTWAIT blocks indefinitely.
-  SKIP_IF(IsRunningWithHostinet());
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  char received[512];
-  EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  // Connect the socket, then shutdown from another thread.
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  ScopedThread t([&] {
-    absl::SleepFor(absl::Milliseconds(200));
-    EXPECT_THAT(shutdown(sock_.get(), SHUT_RD), SyscallSucceeds());
-  });
-  EXPECT_THAT(RetryEINTR(recv)(sock_.get(), received, sizeof(received), 0),
-              SyscallSucceedsWithValue(0));
-  t.Join();
-
-  EXPECT_THAT(RetryEINTR(recv)(sock_.get(), received, sizeof(received), 0),
-              SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UdpSocketTest, WriteShutdown) {
-  ASSERT_NO_ERRNO(BindLoopback());
-  EXPECT_THAT(shutdown(sock_.get(), SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-  EXPECT_THAT(shutdown(sock_.get(), SHUT_WR), SyscallSucceeds());
-}
-
-TEST_P(UdpSocketTest, SynchronousReceive) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Send some data to bind_ from another thread.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  // Receive the data prior to actually starting the other thread.
-  char received[512];
-  EXPECT_THAT(
-      RetryEINTR(recv)(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
-      SyscallFailsWithErrno(EWOULDBLOCK));
-
-  // Start the thread.
-  ScopedThread t([&] {
-    absl::SleepFor(absl::Milliseconds(200));
-    ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, this->bind_addr_,
-                       this->addrlen_),
-                SyscallSucceedsWithValue(sizeof(buf)));
-  });
-
-  EXPECT_THAT(RetryEINTR(recv)(bind_.get(), received, sizeof(received), 0),
-              SyscallSucceedsWithValue(512));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, BoundaryPreserved_SendRecv) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Send 3 packets from sock to bind_.
-  constexpr int psize = 100;
-  char buf[3 * psize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  for (int i = 0; i < 3; ++i) {
-    ASSERT_THAT(
-        sendto(sock_.get(), buf + i * psize, psize, 0, bind_addr_, addrlen_),
-        SyscallSucceedsWithValue(psize));
-  }
-
-  // Receive the data as 3 separate packets.
-  char received[6 * psize];
-  for (int i = 0; i < 3; ++i) {
-    EXPECT_THAT(recv(bind_.get(), received + i * psize, 3 * psize, 0),
-                SyscallSucceedsWithValue(psize));
-  }
-  EXPECT_EQ(memcmp(buf, received, 3 * psize), 0);
-}
-
-TEST_P(UdpSocketTest, BoundaryPreserved_WritevReadv) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Direct writes from sock to bind_.
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  // Send 2 packets from sock to bind_, where each packet's data consists of
-  // 2 discontiguous iovecs.
-  constexpr size_t kPieceSize = 100;
-  char buf[4 * kPieceSize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  for (int i = 0; i < 2; i++) {
-    struct iovec iov[2];
-    for (int j = 0; j < 2; j++) {
-      iov[j].iov_base = reinterpret_cast<void*>(
-          reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
-      iov[j].iov_len = kPieceSize;
-    }
-    ASSERT_THAT(writev(sock_.get(), iov, 2),
-                SyscallSucceedsWithValue(2 * kPieceSize));
-  }
-
-  // Receive the data as 2 separate packets.
-  char received[6 * kPieceSize];
-  for (int i = 0; i < 2; i++) {
-    struct iovec iov[3];
-    for (int j = 0; j < 3; j++) {
-      iov[j].iov_base = reinterpret_cast<void*>(
-          reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
-      iov[j].iov_len = kPieceSize;
-    }
-    ASSERT_THAT(readv(bind_.get(), iov, 3),
-                SyscallSucceedsWithValue(2 * kPieceSize));
-  }
-  EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
-}
-
-TEST_P(UdpSocketTest, BoundaryPreserved_SendMsgRecvMsg) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Send 2 packets from sock to bind_, where each packet's data consists of
-  // 2 discontiguous iovecs.
-  constexpr size_t kPieceSize = 100;
-  char buf[4 * kPieceSize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  for (int i = 0; i < 2; i++) {
-    struct iovec iov[2];
-    for (int j = 0; j < 2; j++) {
-      iov[j].iov_base = reinterpret_cast<void*>(
-          reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
-      iov[j].iov_len = kPieceSize;
-    }
-    struct msghdr msg = {};
-    msg.msg_name = bind_addr_;
-    msg.msg_namelen = addrlen_;
-    msg.msg_iov = iov;
-    msg.msg_iovlen = 2;
-    ASSERT_THAT(sendmsg(sock_.get(), &msg, 0),
-                SyscallSucceedsWithValue(2 * kPieceSize));
-  }
-
-  // Receive the data as 2 separate packets.
-  char received[6 * kPieceSize];
-  for (int i = 0; i < 2; i++) {
-    struct iovec iov[3];
-    for (int j = 0; j < 3; j++) {
-      iov[j].iov_base = reinterpret_cast<void*>(
-          reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
-      iov[j].iov_len = kPieceSize;
-    }
-    struct msghdr msg = {};
-    msg.msg_iov = iov;
-    msg.msg_iovlen = 3;
-    ASSERT_THAT(recvmsg(bind_.get(), &msg, 0),
-                SyscallSucceedsWithValue(2 * kPieceSize));
-  }
-  EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
-}
-
-TEST_P(UdpSocketTest, FIONREADShutdown) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  int n = -1;
-  EXPECT_THAT(ioctl(sock_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  // A UDP socket must be connected before it can be shutdown.
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(sock_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  EXPECT_THAT(shutdown(sock_.get(), SHUT_RD), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(sock_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-}
-
-TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
-  int n = -1;
-  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // A UDP socket must be connected before it can be shutdown.
-  ASSERT_THAT(connect(bind_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  const char str[] = "abc";
-  ASSERT_THAT(send(bind_.get(), str, sizeof(str), 0),
-              SyscallSucceedsWithValue(sizeof(str)));
-
-  struct pollfd pfd = {bind_.get(), POLLIN, 0};
-  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
-              SyscallSucceedsWithValue(1));
-
-  n = -1;
-  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, sizeof(str));
-
-  EXPECT_THAT(shutdown(bind_.get(), SHUT_RD), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, sizeof(str));
-}
-
-// NOTE: Do not use `FIONREAD` as test name because it will be replaced by the
-// corresponding macro and become `0x541B`.
-TEST_P(UdpSocketTest, Fionread) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Check that the bound socket with an empty buffer reports an empty first
-  // packet.
-  int n = -1;
-  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  // Send 3 packets from sock to bind_.
-  constexpr int psize = 100;
-  char buf[3 * psize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  struct pollfd pfd = {bind_.get(), POLLIN, 0};
-  for (int i = 0; i < 3; ++i) {
-    ASSERT_THAT(
-        sendto(sock_.get(), buf + i * psize, psize, 0, bind_addr_, addrlen_),
-        SyscallSucceedsWithValue(psize));
-
-    ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
-                SyscallSucceedsWithValue(1));
-
-    // Check that regardless of how many packets are in the queue, the size
-    // reported is that of a single packet.
-    n = -1;
-    EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-    EXPECT_EQ(n, psize);
-  }
-}
-
-TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) {
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // Check that the bound socket with an empty buffer reports an empty first
-  // packet.
-  int n = -1;
-  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  // Send 3 packets from sock to bind_.
-  constexpr int psize = 100;
-  char buf[3 * psize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  struct pollfd pfd = {bind_.get(), POLLIN, 0};
-  for (int i = 0; i < 3; ++i) {
-    ASSERT_THAT(
-        sendto(sock_.get(), buf + i * psize, 0, 0, bind_addr_, addrlen_),
-        SyscallSucceedsWithValue(0));
-
-    // TODO(gvisor.dev/issue/2726): sending a zero-length message to a hostinet
-    // socket does not cause a poll event to be triggered.
-    if (!IsRunningWithHostinet()) {
-      ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
-                  SyscallSucceedsWithValue(1));
-    }
-
-    // Check that regardless of how many packets are in the queue, the size
-    // reported is that of a single packet.
-    n = -1;
-    EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-    EXPECT_EQ(n, 0);
-  }
-}
-
-TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) {
-  int n = -1;
-  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  // A UDP socket must be connected before it can be shutdown.
-  ASSERT_THAT(connect(bind_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  const char str[] = "abc";
-  ASSERT_THAT(send(bind_.get(), str, 0, 0), SyscallSucceedsWithValue(0));
-
-  n = -1;
-  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  EXPECT_THAT(shutdown(bind_.get(), SHUT_RD), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-}
-
-TEST_P(UdpSocketTest, SoNoCheckOffByDefault) {
-  // TODO(gvisor.dev/issue/1202): SO_NO_CHECK socket option not supported by
-  // hostinet.
-  SKIP_IF(IsRunningWithHostinet());
-
-  int v = -1;
-  socklen_t optlen = sizeof(v);
-  ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, &optlen),
-              SyscallSucceeds());
-  ASSERT_EQ(v, kSockOptOff);
-  ASSERT_EQ(optlen, sizeof(v));
-}
-
-TEST_P(UdpSocketTest, SoNoCheck) {
-  // TODO(gvisor.dev/issue/1202): SO_NO_CHECK socket option not supported by
-  // hostinet.
-  SKIP_IF(IsRunningWithHostinet());
-
-  int v = kSockOptOn;
-  socklen_t optlen = sizeof(v);
-  ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, optlen),
-              SyscallSucceeds());
-  v = -1;
-  ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, &optlen),
-              SyscallSucceeds());
-  ASSERT_EQ(v, kSockOptOn);
-  ASSERT_EQ(optlen, sizeof(v));
-
-  v = kSockOptOff;
-  ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, optlen),
-              SyscallSucceeds());
-  v = -1;
-  ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, &optlen),
-              SyscallSucceeds());
-  ASSERT_EQ(v, kSockOptOff);
-  ASSERT_EQ(optlen, sizeof(v));
-}
-
-TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
-  // TODO(gvisor.dev/issue/1202): SO_TIMESTAMP socket option not supported by
-  // hostinet.
-  SKIP_IF(IsRunningWithHostinet());
-
-  int v = -1;
-  socklen_t optlen = sizeof(v);
-  ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_TIMESTAMP, &v, &optlen),
-              SyscallSucceeds());
-  ASSERT_EQ(v, kSockOptOff);
-  ASSERT_EQ(optlen, sizeof(v));
-}
-
-TEST_P(UdpSocketTest, SoTimestamp) {
-  // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
-  // supported by hostinet.
-  SKIP_IF(IsRunningWithHostinet());
-
-  ASSERT_NO_ERRNO(BindLoopback());
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  int v = 1;
-  ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
-              SyscallSucceeds());
-
-  char buf[3];
-  // Send zero length packet from sock to bind_.
-  ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, 0),
-              SyscallSucceedsWithValue(0));
-
-  struct pollfd pfd = {bind_.get(), POLLIN, 0};
-  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
-              SyscallSucceedsWithValue(1));
-
-  char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
-  msghdr msg;
-  memset(&msg, 0, sizeof(msg));
-  iovec iov;
-  memset(&iov, 0, sizeof(iov));
-  msg.msg_iov = &iov;
-  msg.msg_iovlen = 1;
-  msg.msg_control = cmsgbuf;
-  msg.msg_controllen = sizeof(cmsgbuf);
-
-  ASSERT_THAT(RetryEINTR(recvmsg)(bind_.get(), &msg, 0),
-              SyscallSucceedsWithValue(0));
-
-  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
-  ASSERT_NE(cmsg, nullptr);
-  ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
-  ASSERT_EQ(cmsg->cmsg_type, SO_TIMESTAMP);
-  ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct timeval)));
-
-  struct timeval tv = {};
-  memcpy(&tv, CMSG_DATA(cmsg), sizeof(struct timeval));
-
-  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
-
-  // There should be nothing to get via ioctl.
-  ASSERT_THAT(ioctl(bind_.get(), SIOCGSTAMP, &tv),
-              SyscallFailsWithErrno(ENOENT));
-}
-
-TEST_P(UdpSocketTest, WriteShutdownNotConnected) {
-  EXPECT_THAT(shutdown(bind_.get(), SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
-}
-
-TEST_P(UdpSocketTest, TimestampIoctl) {
-  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
-  SKIP_IF(IsRunningWithHostinet());
-
-  ASSERT_NO_ERRNO(BindLoopback());
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  char buf[3];
-  // Send packet from sock to bind_.
-  ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, sizeof(buf)),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  struct pollfd pfd = {bind_.get(), POLLIN, 0};
-  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
-              SyscallSucceedsWithValue(1));
-
-  // There should be no control messages.
-  char recv_buf[sizeof(buf)];
-  ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(bind_.get(), recv_buf, sizeof(recv_buf)));
-
-  // A nonzero timeval should be available via ioctl.
-  struct timeval tv = {};
-  ASSERT_THAT(ioctl(bind_.get(), SIOCGSTAMP, &tv), SyscallSucceeds());
-  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
-}
-
-TEST_P(UdpSocketTest, TimestampIoctlNothingRead) {
-  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
-  SKIP_IF(IsRunningWithHostinet());
-
-  ASSERT_NO_ERRNO(BindLoopback());
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  struct timeval tv = {};
-  ASSERT_THAT(ioctl(sock_.get(), SIOCGSTAMP, &tv),
-              SyscallFailsWithErrno(ENOENT));
-}
-
-// Test that the timestamp accessed via SIOCGSTAMP is still accessible after
-// SO_TIMESTAMP is enabled and used to retrieve a timestamp.
-TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
-  // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
-  // supported by hostinet.
-  SKIP_IF(IsRunningWithHostinet());
-
-  ASSERT_NO_ERRNO(BindLoopback());
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  char buf[3];
-  // Send packet from sock to bind_.
-  ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, sizeof(buf)),
-              SyscallSucceedsWithValue(sizeof(buf)));
-  ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, 0),
-              SyscallSucceedsWithValue(0));
-
-  struct pollfd pfd = {bind_.get(), POLLIN, 0};
-  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
-              SyscallSucceedsWithValue(1));
-
-  // There should be no control messages.
-  char recv_buf[sizeof(buf)];
-  ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(bind_.get(), recv_buf, sizeof(recv_buf)));
-
-  // A nonzero timeval should be available via ioctl.
-  struct timeval tv = {};
-  ASSERT_THAT(ioctl(bind_.get(), SIOCGSTAMP, &tv), SyscallSucceeds());
-  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
-
-  // Enable SO_TIMESTAMP and send a message.
-  int v = 1;
-  EXPECT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
-              SyscallSucceeds());
-  ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, 0),
-              SyscallSucceedsWithValue(0));
-
-  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
-              SyscallSucceedsWithValue(1));
-
-  // There should be a message for SO_TIMESTAMP.
-  char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
-  msghdr msg = {};
-  iovec iov = {};
-  msg.msg_iov = &iov;
-  msg.msg_iovlen = 1;
-  msg.msg_control = cmsgbuf;
-  msg.msg_controllen = sizeof(cmsgbuf);
-  ASSERT_THAT(RetryEINTR(recvmsg)(bind_.get(), &msg, 0),
-              SyscallSucceedsWithValue(0));
-  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
-  ASSERT_NE(cmsg, nullptr);
-
-  // The ioctl should return the exact same values as before.
-  struct timeval tv2 = {};
-  ASSERT_THAT(ioctl(bind_.get(), SIOCGSTAMP, &tv2), SyscallSucceeds());
-  ASSERT_EQ(tv.tv_sec, tv2.tv_sec);
-  ASSERT_EQ(tv.tv_usec, tv2.tv_usec);
-}
-
-// Test that a socket with IP_TOS or IPV6_TCLASS set will set the TOS byte on
-// outgoing packets, and that a receiving socket with IP_RECVTOS or
-// IPV6_RECVTCLASS will create the corresponding control message.
-TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  ASSERT_NO_ERRNO(BindLoopback());
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  // Allow socket to receive control message.
-  int recv_level = SOL_IP;
-  int recv_type = IP_RECVTOS;
-  if (GetParam() != AddressFamily::kIpv4) {
-    recv_level = SOL_IPV6;
-    recv_type = IPV6_RECVTCLASS;
-  }
-  ASSERT_THAT(setsockopt(bind_.get(), recv_level, recv_type, &kSockOptOn,
-                         sizeof(kSockOptOn)),
-              SyscallSucceeds());
-
-  // Set socket TOS.
-  int sent_level = recv_level;
-  int sent_type = IP_TOS;
-  if (sent_level == SOL_IPV6) {
-    sent_type = IPV6_TCLASS;
-  }
-  int sent_tos = IPTOS_LOWDELAY;  // Choose some TOS value.
-  ASSERT_THAT(setsockopt(sock_.get(), sent_level, sent_type, &sent_tos,
-                         sizeof(sent_tos)),
-              SyscallSucceeds());
-
-  // Prepare message to send.
-  constexpr size_t kDataLength = 1024;
-  struct msghdr sent_msg = {};
-  struct iovec sent_iov = {};
-  char sent_data[kDataLength];
-  sent_iov.iov_base = &sent_data[0];
-  sent_iov.iov_len = kDataLength;
-  sent_msg.msg_iov = &sent_iov;
-  sent_msg.msg_iovlen = 1;
-
-  ASSERT_THAT(RetryEINTR(sendmsg)(sock_.get(), &sent_msg, 0),
-              SyscallSucceedsWithValue(kDataLength));
-
-  // Receive message.
-  struct msghdr received_msg = {};
-  struct iovec received_iov = {};
-  char received_data[kDataLength];
-  received_iov.iov_base = &received_data[0];
-  received_iov.iov_len = kDataLength;
-  received_msg.msg_iov = &received_iov;
-  received_msg.msg_iovlen = 1;
-  size_t cmsg_data_len = sizeof(int8_t);
-  if (sent_type == IPV6_TCLASS) {
-    cmsg_data_len = sizeof(int);
-  }
-  std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
-  received_msg.msg_control = &received_cmsgbuf[0];
-  received_msg.msg_controllen = received_cmsgbuf.size();
-  ASSERT_THAT(RetryEINTR(recvmsg)(bind_.get(), &received_msg, 0),
-              SyscallSucceedsWithValue(kDataLength));
-
-  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
-  ASSERT_NE(cmsg, nullptr);
-  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
-  EXPECT_EQ(cmsg->cmsg_level, sent_level);
-  EXPECT_EQ(cmsg->cmsg_type, sent_type);
-  int8_t received_tos = 0;
-  memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
-  EXPECT_EQ(received_tos, sent_tos);
-}
-
-// Test that sendmsg with IP_TOS and IPV6_TCLASS control messages will set the
-// TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
-// IPV6_RECVTCLASS will create the corresponding control message.
-TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
-  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
-
-  ASSERT_NO_ERRNO(BindLoopback());
-  ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
-
-  // Allow socket to receive control message.
-  int recv_level = SOL_IP;
-  int recv_type = IP_RECVTOS;
-  if (GetParam() != AddressFamily::kIpv4) {
-    recv_level = SOL_IPV6;
-    recv_type = IPV6_RECVTCLASS;
-  }
-  int recv_opt = kSockOptOn;
-  ASSERT_THAT(setsockopt(bind_.get(), recv_level, recv_type, &recv_opt,
-                         sizeof(recv_opt)),
-              SyscallSucceeds());
-
-  // Prepare message to send.
-  constexpr size_t kDataLength = 1024;
-  int sent_level = recv_level;
-  int sent_type = IP_TOS;
-  int sent_tos = IPTOS_LOWDELAY;  // Choose some TOS value.
-
-  struct msghdr sent_msg = {};
-  struct iovec sent_iov = {};
-  char sent_data[kDataLength];
-  sent_iov.iov_base = &sent_data[0];
-  sent_iov.iov_len = kDataLength;
-  sent_msg.msg_iov = &sent_iov;
-  sent_msg.msg_iovlen = 1;
-  size_t cmsg_data_len = sizeof(int8_t);
-  if (sent_level == SOL_IPV6) {
-    sent_type = IPV6_TCLASS;
-    cmsg_data_len = sizeof(int);
-  }
-  std::vector<char> sent_cmsgbuf(CMSG_SPACE(cmsg_data_len));
-  sent_msg.msg_control = &sent_cmsgbuf[0];
-  sent_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
-
-  // Manually add control message.
-  struct cmsghdr* sent_cmsg = CMSG_FIRSTHDR(&sent_msg);
-  sent_cmsg->cmsg_len = CMSG_LEN(cmsg_data_len);
-  sent_cmsg->cmsg_level = sent_level;
-  sent_cmsg->cmsg_type = sent_type;
-  *(int8_t*)CMSG_DATA(sent_cmsg) = sent_tos;
-
-  ASSERT_THAT(RetryEINTR(sendmsg)(sock_.get(), &sent_msg, 0),
-              SyscallSucceedsWithValue(kDataLength));
-
-  // Receive message.
-  struct msghdr received_msg = {};
-  struct iovec received_iov = {};
-  char received_data[kDataLength];
-  received_iov.iov_base = &received_data[0];
-  received_iov.iov_len = kDataLength;
-  received_msg.msg_iov = &received_iov;
-  received_msg.msg_iovlen = 1;
-  std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
-  received_msg.msg_control = &received_cmsgbuf[0];
-  received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
-  ASSERT_THAT(RetryEINTR(recvmsg)(bind_.get(), &received_msg, 0),
-              SyscallSucceedsWithValue(kDataLength));
-
-  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
-  ASSERT_NE(cmsg, nullptr);
-  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
-  EXPECT_EQ(cmsg->cmsg_level, sent_level);
-  EXPECT_EQ(cmsg->cmsg_type, sent_type);
-  int8_t received_tos = 0;
-  memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
-  EXPECT_EQ(received_tos, sent_tos);
-}
-
-TEST_P(UdpSocketTest, RecvBufLimitsEmptyRcvBuf) {
-  // Discover minimum buffer size by setting it to zero.
-  constexpr int kRcvBufSz = 0;
-  ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &kRcvBufSz,
-                         sizeof(kRcvBufSz)),
-              SyscallSucceeds());
-
-  int min = 0;
-  socklen_t min_len = sizeof(min);
-  ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &min, &min_len),
-              SyscallSucceeds());
-
-  // Bind bind_ to loopback.
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  {
-    // Send data of size min and verify that it's received.
-    std::vector<char> buf(min);
-    RandomizeBuffer(buf.data(), buf.size());
-    ASSERT_THAT(
-        sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
-        SyscallSucceedsWithValue(buf.size()));
-    std::vector<char> received(buf.size());
-    EXPECT_THAT(
-        recv(bind_.get(), received.data(), received.size(), MSG_DONTWAIT),
-        SyscallSucceedsWithValue(received.size()));
-  }
-
-  {
-    // Send data of size min + 1 and verify that its received. Both linux and
-    // Netstack accept a dgram that exceeds rcvBuf limits if the receive buffer
-    // is currently empty.
-    std::vector<char> buf(min + 1);
-    RandomizeBuffer(buf.data(), buf.size());
-    ASSERT_THAT(
-        sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
-        SyscallSucceedsWithValue(buf.size()));
-
-    std::vector<char> received(buf.size());
-    EXPECT_THAT(
-        recv(bind_.get(), received.data(), received.size(), MSG_DONTWAIT),
-        SyscallSucceedsWithValue(received.size()));
-  }
-}
-
-// Test that receive buffer limits are enforced.
-TEST_P(UdpSocketTest, RecvBufLimits) {
-  // Bind s_ to loopback.
-  ASSERT_NO_ERRNO(BindLoopback());
-
-  int min = 0;
-  {
-    // Discover minimum buffer size by trying to set it to zero.
-    constexpr int kRcvBufSz = 0;
-    ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &kRcvBufSz,
-                           sizeof(kRcvBufSz)),
-                SyscallSucceeds());
-
-    socklen_t min_len = sizeof(min);
-    ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &min, &min_len),
-                SyscallSucceeds());
-  }
-
-  // Now set the limit to min * 4.
-  int new_rcv_buf_sz = min * 4;
-  if (!IsRunningOnGvisor() || IsRunningWithHostinet()) {
-    // Linux doubles the value specified so just set to min * 2.
-    new_rcv_buf_sz = min * 2;
-  }
-
-  ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &new_rcv_buf_sz,
-                         sizeof(new_rcv_buf_sz)),
-              SyscallSucceeds());
-  int rcv_buf_sz = 0;
-  {
-    socklen_t rcv_buf_len = sizeof(rcv_buf_sz);
-    ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &rcv_buf_sz,
-                           &rcv_buf_len),
-                SyscallSucceeds());
-  }
-
-  {
-    std::vector<char> buf(min);
-    RandomizeBuffer(buf.data(), buf.size());
-
-    ASSERT_THAT(
-        sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
-        SyscallSucceedsWithValue(buf.size()));
-    ASSERT_THAT(
-        sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
-        SyscallSucceedsWithValue(buf.size()));
-    ASSERT_THAT(
-        sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
-        SyscallSucceedsWithValue(buf.size()));
-    ASSERT_THAT(
-        sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
-        SyscallSucceedsWithValue(buf.size()));
-    int sent = 4;
-    if (IsRunningOnGvisor() && !IsRunningWithHostinet()) {
-      // Linux seems to drop the 4th packet even though technically it should
-      // fit in the receive buffer.
-      ASSERT_THAT(
-          sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
-          SyscallSucceedsWithValue(buf.size()));
-      sent++;
-    }
-
-    for (int i = 0; i < sent - 1; i++) {
-      // Receive the data.
-      std::vector<char> received(buf.size());
-      EXPECT_THAT(
-          recv(bind_.get(), received.data(), received.size(), MSG_DONTWAIT),
-          SyscallSucceedsWithValue(received.size()));
-      EXPECT_EQ(memcmp(buf.data(), received.data(), buf.size()), 0);
-    }
-
-    // The last receive should fail with EAGAIN as the last packet should have
-    // been dropped due to lack of space in the receive buffer.
-    std::vector<char> received(buf.size());
-    EXPECT_THAT(
-        recv(bind_.get(), received.data(), received.size(), MSG_DONTWAIT),
-        SyscallFailsWithErrno(EAGAIN));
-  }
-}
-
-#ifndef __fuchsia__
-
-// TODO(gvisor.dev/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
-// gVisor currently silently ignores attaching a filter.
-TEST_P(UdpSocketTest, SetSocketDetachFilter) {
-  // Program generated using sudo tcpdump -i lo udp and port 1234 -dd
-  struct sock_filter code[] = {
-      {0x28, 0, 0, 0x0000000c},  {0x15, 0, 6, 0x000086dd},
-      {0x30, 0, 0, 0x00000014},  {0x15, 0, 15, 0x00000011},
-      {0x28, 0, 0, 0x00000036},  {0x15, 12, 0, 0x000004d2},
-      {0x28, 0, 0, 0x00000038},  {0x15, 10, 11, 0x000004d2},
-      {0x15, 0, 10, 0x00000800}, {0x30, 0, 0, 0x00000017},
-      {0x15, 0, 8, 0x00000011},  {0x28, 0, 0, 0x00000014},
-      {0x45, 6, 0, 0x00001fff},  {0xb1, 0, 0, 0x0000000e},
-      {0x48, 0, 0, 0x0000000e},  {0x15, 2, 0, 0x000004d2},
-      {0x48, 0, 0, 0x00000010},  {0x15, 0, 1, 0x000004d2},
-      {0x6, 0, 0, 0x00040000},   {0x6, 0, 0, 0x00000000},
-  };
-  struct sock_fprog bpf = {
-      .len = ABSL_ARRAYSIZE(code),
-      .filter = code,
-  };
-  ASSERT_THAT(
-      setsockopt(sock_.get(), SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)),
-      SyscallSucceeds());
-
-  constexpr int val = 0;
-  ASSERT_THAT(
-      setsockopt(sock_.get(), SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)),
-      SyscallSucceeds());
-}
-
-TEST_P(UdpSocketTest, SetSocketDetachFilterNoInstalledFilter) {
-  // TODO(gvisor.dev/2746): Support SO_ATTACH_FILTER/SO_DETACH_FILTER.
-  SKIP_IF(IsRunningOnGvisor());
-  constexpr int val = 0;
-  ASSERT_THAT(
-      setsockopt(sock_.get(), SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)),
-      SyscallFailsWithErrno(ENOENT));
-}
-
-TEST_P(UdpSocketTest, GetSocketDetachFilter) {
-  int val = 0;
-  socklen_t val_len = sizeof(val);
-  ASSERT_THAT(
-      getsockopt(sock_.get(), SOL_SOCKET, SO_DETACH_FILTER, &val, &val_len),
-      SyscallFailsWithErrno(ENOPROTOOPT));
-}
-
-#endif  // __fuchsia__
-
-}  // namespace testing
-}  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.h b/test/syscalls/linux/udp_socket_test_cases.h
deleted file mode 100644
index f7e25c805..000000000
--- a/test/syscalls/linux/udp_socket_test_cases.h
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_
-#define THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_
-
-#include <sys/socket.h>
-
-#include "gtest/gtest.h"
-#include "test/syscalls/linux/socket_test_util.h"
-#include "test/util/file_descriptor.h"
-#include "test/util/posix_error.h"
-
-namespace gvisor {
-namespace testing {
-
-// The initial port to be be used on gvisor.
-constexpr int TestPort = 40000;
-
-// Fixture for tests parameterized by the address family to use (AF_INET and
-// AF_INET6) when creating sockets.
-class UdpSocketTest
-    : public ::testing::TestWithParam<gvisor::testing::AddressFamily> {
- protected:
-  // Creates two sockets that will be used by test cases.
-  void SetUp() override;
-
-  // Binds the socket bind_ to the loopback and updates bind_addr_.
-  PosixError BindLoopback();
-
-  // Binds the socket bind_ to Any and updates bind_addr_.
-  PosixError BindAny();
-
-  // Binds given socket to address addr and updates.
-  PosixError BindSocket(int socket, struct sockaddr* addr);
-
-  // Return initialized Any address to port 0.
-  struct sockaddr_storage InetAnyAddr();
-
-  // Return initialized Loopback address to port 0.
-  struct sockaddr_storage InetLoopbackAddr();
-
-  // Disconnects socket sockfd.
-  void Disconnect(int sockfd);
-
-  // Get family for the test.
-  int GetFamily();
-
-  // Socket used by Bind methods
-  FileDescriptor bind_;
-
-  // Second socket used for tests.
-  FileDescriptor sock_;
-
-  // Address for bind_ socket.
-  struct sockaddr* bind_addr_;
-
-  // Initialized to the length based on GetFamily().
-  socklen_t addrlen_;
-
-  // Storage for bind_addr_.
-  struct sockaddr_storage bind_addr_storage_;
-
- private:
-  // Helper to initialize addrlen_ for the test case.
-  socklen_t GetAddrLength();
-};
-}  // namespace testing
-}  // namespace gvisor
-
-#endif  // THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_
diff --git a/test/syscalls/linux/unlink.cc b/test/syscalls/linux/unlink.cc
index 2040375c9..061e2e0f1 100644
--- a/test/syscalls/linux/unlink.cc
+++ b/test/syscalls/linux/unlink.cc
@@ -208,6 +208,20 @@ TEST(RmdirTest, CanRemoveWithTrailingSlashes) {
   ASSERT_THAT(rmdir(slashslash.c_str()), SyscallSucceeds());
 }
 
+TEST(UnlinkTest, UnlinkAtEmptyPath) {
+  auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+  auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+  EXPECT_THAT(unlinkat(fd.get(), "", 0), SyscallFailsWithErrno(ENOENT));
+
+  auto dirInDir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path()));
+  auto dirFD = ASSERT_NO_ERRNO_AND_VALUE(
+      Open(dirInDir.path(), O_RDONLY | O_DIRECTORY, 0666));
+  EXPECT_THAT(unlinkat(dirFD.get(), "", AT_REMOVEDIR),
+              SyscallFailsWithErrno(ENOENT));
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/vdso_clock_gettime.cc b/test/syscalls/linux/vdso_clock_gettime.cc
index ce1899f45..2a8699a7b 100644
--- a/test/syscalls/linux/vdso_clock_gettime.cc
+++ b/test/syscalls/linux/vdso_clock_gettime.cc
@@ -38,8 +38,6 @@ std::string PrintClockId(::testing::TestParamInfo<clockid_t> info) {
   switch (info.param) {
     case CLOCK_MONOTONIC:
       return "CLOCK_MONOTONIC";
-    case CLOCK_REALTIME:
-      return "CLOCK_REALTIME";
     case CLOCK_BOOTTIME:
       return "CLOCK_BOOTTIME";
     default:
@@ -47,59 +45,36 @@ std::string PrintClockId(::testing::TestParamInfo<clockid_t> info) {
   }
 }
 
-class CorrectVDSOClockTest : public ::testing::TestWithParam<clockid_t> {};
+class MonotonicVDSOClockTest : public ::testing::TestWithParam<clockid_t> {};
 
-TEST_P(CorrectVDSOClockTest, IsCorrect) {
+TEST_P(MonotonicVDSOClockTest, IsCorrect) {
+  // The VDSO implementation of clock_gettime() uses the TSC. On KVM, sentry and
+  // application TSCs can be very desynchronized; see
+  // sentry/platform/kvm/kvm.vCPU.setSystemTime().
+  SKIP_IF(GvisorPlatform() == Platform::kKVM);
+
+  // Check that when we alternate readings from the clock_gettime syscall and
+  // the VDSO's implementation, we observe the combined sequence as being
+  // monotonic.
   struct timespec tvdso, tsys;
   absl::Time vdso_time, sys_time;
-  uint64_t total_calls = 0;
-
-  // It is expected that 82.5% of clock_gettime calls will be less than 100us
-  // skewed from the system time.
-  // Unfortunately this is not only influenced by the VDSO clock skew, but also
-  // by arbitrary scheduling delays and the like. The test is therefore
-  // regularly disabled.
-  std::map<absl::Duration, std::tuple<double, uint64_t, uint64_t>> confidence =
-      {
-          {absl::Microseconds(100), std::make_tuple(0.825, 0, 0)},
-          {absl::Microseconds(250), std::make_tuple(0.94, 0, 0)},
-          {absl::Milliseconds(1), std::make_tuple(0.999, 0, 0)},
-      };
-
-  absl::Time start = absl::Now();
-  while (absl::Now() < start + absl::Seconds(30)) {
-    EXPECT_THAT(clock_gettime(GetParam(), &tvdso), SyscallSucceeds());
-    EXPECT_THAT(syscall(__NR_clock_gettime, GetParam(), &tsys),
-                SyscallSucceeds());
-
+  ASSERT_THAT(syscall(__NR_clock_gettime, GetParam(), &tsys),
+              SyscallSucceeds());
+  sys_time = absl::TimeFromTimespec(tsys);
+  auto end = absl::Now() + absl::Seconds(10);
+  while (absl::Now() < end) {
+    ASSERT_THAT(clock_gettime(GetParam(), &tvdso), SyscallSucceeds());
     vdso_time = absl::TimeFromTimespec(tvdso);
-
-    for (auto const& conf : confidence) {
-      std::get<1>(confidence[conf.first]) +=
-          (sys_time - vdso_time) < conf.first;
-    }
-
+    EXPECT_LE(sys_time, vdso_time);
+    ASSERT_THAT(syscall(__NR_clock_gettime, GetParam(), &tsys),
+                SyscallSucceeds());
     sys_time = absl::TimeFromTimespec(tsys);
-
-    for (auto const& conf : confidence) {
-      std::get<2>(confidence[conf.first]) +=
-          (vdso_time - sys_time) < conf.first;
-    }
-
-    ++total_calls;
-  }
-
-  for (auto const& conf : confidence) {
-    EXPECT_GE(std::get<1>(conf.second) / static_cast<double>(total_calls),
-              std::get<0>(conf.second));
-    EXPECT_GE(std::get<2>(conf.second) / static_cast<double>(total_calls),
-              std::get<0>(conf.second));
+    EXPECT_LE(vdso_time, sys_time);
   }
 }
 
-INSTANTIATE_TEST_SUITE_P(ClockGettime, CorrectVDSOClockTest,
-                         ::testing::Values(CLOCK_MONOTONIC, CLOCK_REALTIME,
-                                           CLOCK_BOOTTIME),
+INSTANTIATE_TEST_SUITE_P(ClockGettime, MonotonicVDSOClockTest,
+                         ::testing::Values(CLOCK_MONOTONIC, CLOCK_BOOTTIME),
                          PrintClockId);
 
 }  // namespace
diff --git a/test/syscalls/linux/write.cc b/test/syscalls/linux/write.cc
index 39b5b2f56..77bcfbb8a 100644
--- a/test/syscalls/linux/write.cc
+++ b/test/syscalls/linux/write.cc
@@ -133,6 +133,91 @@ TEST_F(WriteTest, WriteExceedsRLimit) {
   EXPECT_THAT(close(fd), SyscallSucceeds());
 }
 
+TEST_F(WriteTest, WriteIncrementOffset) {
+  TempPath tmpfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  FileDescriptor f =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfile.path().c_str(), O_WRONLY));
+  int fd = f.get();
+
+  EXPECT_THAT(WriteBytes(fd, 0), SyscallSucceedsWithValue(0));
+  EXPECT_THAT(lseek(fd, 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+
+  const int bytes_total = 1024;
+
+  EXPECT_THAT(WriteBytes(fd, bytes_total),
+              SyscallSucceedsWithValue(bytes_total));
+  EXPECT_THAT(lseek(fd, 0, SEEK_CUR), SyscallSucceedsWithValue(bytes_total));
+}
+
+TEST_F(WriteTest, WriteIncrementOffsetSeek) {
+  const std::string data = "hello world\n";
+  TempPath tmpfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), data, TempPath::kDefaultFileMode));
+  FileDescriptor f =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfile.path().c_str(), O_WRONLY));
+  int fd = f.get();
+
+  const int seek_offset = data.size() / 2;
+  ASSERT_THAT(lseek(fd, seek_offset, SEEK_SET),
+              SyscallSucceedsWithValue(seek_offset));
+
+  const int write_bytes = 512;
+  EXPECT_THAT(WriteBytes(fd, write_bytes),
+              SyscallSucceedsWithValue(write_bytes));
+  EXPECT_THAT(lseek(fd, 0, SEEK_CUR),
+              SyscallSucceedsWithValue(seek_offset + write_bytes));
+}
+
+TEST_F(WriteTest, WriteIncrementOffsetAppend) {
+  const std::string data = "hello world\n";
+  TempPath tmpfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), data, TempPath::kDefaultFileMode));
+  FileDescriptor f = ASSERT_NO_ERRNO_AND_VALUE(
+      Open(tmpfile.path().c_str(), O_WRONLY | O_APPEND));
+  int fd = f.get();
+
+  EXPECT_THAT(WriteBytes(fd, 1024), SyscallSucceedsWithValue(1024));
+  EXPECT_THAT(lseek(fd, 0, SEEK_CUR),
+              SyscallSucceedsWithValue(data.size() + 1024));
+}
+
+TEST_F(WriteTest, WriteIncrementOffsetEOF) {
+  const std::string data = "hello world\n";
+  const TempPath tmpfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), data, TempPath::kDefaultFileMode));
+  FileDescriptor f =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfile.path().c_str(), O_WRONLY));
+  int fd = f.get();
+
+  EXPECT_THAT(lseek(fd, 0, SEEK_END), SyscallSucceedsWithValue(data.size()));
+
+  EXPECT_THAT(WriteBytes(fd, 1024), SyscallSucceedsWithValue(1024));
+  EXPECT_THAT(lseek(fd, 0, SEEK_END),
+              SyscallSucceedsWithValue(data.size() + 1024));
+}
+
+TEST_F(WriteTest, PwriteNoChangeOffset) {
+  TempPath tmpfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  FileDescriptor f =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfile.path().c_str(), O_WRONLY));
+  int fd = f.get();
+
+  const std::string data = "hello world\n";
+
+  EXPECT_THAT(pwrite(fd, data.data(), data.size(), 0),
+              SyscallSucceedsWithValue(data.size()));
+  EXPECT_THAT(lseek(fd, 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+
+  const int bytes_total = 1024;
+  ASSERT_THAT(WriteBytes(fd, bytes_total),
+              SyscallSucceedsWithValue(bytes_total));
+  ASSERT_THAT(lseek(fd, 0, SEEK_CUR), SyscallSucceedsWithValue(bytes_total));
+
+  EXPECT_THAT(pwrite(fd, data.data(), data.size(), bytes_total),
+              SyscallSucceedsWithValue(data.size()));
+  EXPECT_THAT(lseek(fd, 0, SEEK_CUR), SyscallSucceedsWithValue(bytes_total));
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index cbcf08451..bd3f829c4 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -28,6 +28,7 @@
 #include "test/syscalls/linux/file_base.h"
 #include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
@@ -37,6 +38,8 @@ namespace testing {
 
 namespace {
 
+using ::gvisor::testing::IsTmpfs;
+
 class XattrTest : public FileTest {};
 
 TEST_F(XattrTest, XattrNonexistentFile) {
@@ -229,7 +232,7 @@ TEST_F(XattrTest, XattrOnInvalidFileTypes) {
   EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
 }
 
-TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
+TEST_F(XattrTest, SetXattrSizeSmallerThanValue) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -244,7 +247,7 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
   EXPECT_EQ(buf, expected_buf);
 }
 
-TEST_F(XattrTest, SetxattrZeroSize) {
+TEST_F(XattrTest, SetXattrZeroSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -256,7 +259,7 @@ TEST_F(XattrTest, SetxattrZeroSize) {
   EXPECT_EQ(buf, '-');
 }
 
-TEST_F(XattrTest, SetxattrSizeTooLarge) {
+TEST_F(XattrTest, SetXattrSizeTooLarge) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
 
@@ -271,7 +274,7 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
-TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
+TEST_F(XattrTest, SetXattrNullValueAndNonzeroSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
@@ -280,7 +283,7 @@ TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
-TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
+TEST_F(XattrTest, SetXattrNullValueAndZeroSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -288,7 +291,7 @@ TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
 }
 
-TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
+TEST_F(XattrTest, SetXattrValueTooLargeButOKSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val(XATTR_SIZE_MAX + 1);
@@ -304,7 +307,7 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
   EXPECT_EQ(buf, expected_buf);
 }
 
-TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
+TEST_F(XattrTest, SetXattrReplaceWithSmaller) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -319,7 +322,7 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
   EXPECT_EQ(buf, expected_buf);
 }
 
-TEST_F(XattrTest, SetxattrReplaceWithLarger) {
+TEST_F(XattrTest, SetXattrReplaceWithLarger) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -333,7 +336,7 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) {
   EXPECT_EQ(buf, val);
 }
 
-TEST_F(XattrTest, SetxattrCreateFlag) {
+TEST_F(XattrTest, SetXattrCreateFlag) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
@@ -344,7 +347,7 @@ TEST_F(XattrTest, SetxattrCreateFlag) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
 }
 
-TEST_F(XattrTest, SetxattrReplaceFlag) {
+TEST_F(XattrTest, SetXattrReplaceFlag) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
@@ -356,14 +359,14 @@ TEST_F(XattrTest, SetxattrReplaceFlag) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
 }
 
-TEST_F(XattrTest, SetxattrInvalidFlags) {
+TEST_F(XattrTest, SetXattrInvalidFlags) {
   const char* path = test_file_name_.c_str();
   int invalid_flags = 0xff;
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
               SyscallFailsWithErrno(EINVAL));
 }
 
-TEST_F(XattrTest, Getxattr) {
+TEST_F(XattrTest, GetXattr) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -375,7 +378,7 @@ TEST_F(XattrTest, Getxattr) {
   EXPECT_EQ(buf, val);
 }
 
-TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
+TEST_F(XattrTest, GetXattrSizeSmallerThanValue) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -387,7 +390,7 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
   EXPECT_EQ(buf, '-');
 }
 
-TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
+TEST_F(XattrTest, GetXattrSizeLargerThanValue) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -402,7 +405,7 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
   EXPECT_EQ(buf, expected_buf);
 }
 
-TEST_F(XattrTest, GetxattrZeroSize) {
+TEST_F(XattrTest, GetXattrZeroSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -415,7 +418,7 @@ TEST_F(XattrTest, GetxattrZeroSize) {
   EXPECT_EQ(buf, '-');
 }
 
-TEST_F(XattrTest, GetxattrSizeTooLarge) {
+TEST_F(XattrTest, GetXattrSizeTooLarge) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -431,7 +434,7 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) {
   EXPECT_EQ(buf, expected_buf);
 }
 
-TEST_F(XattrTest, GetxattrNullValue) {
+TEST_F(XattrTest, GetXattrNullValue) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -442,7 +445,7 @@ TEST_F(XattrTest, GetxattrNullValue) {
               SyscallFailsWithErrno(EFAULT));
 }
 
-TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
+TEST_F(XattrTest, GetXattrNullValueAndZeroSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -458,13 +461,13 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(size));
 }
 
-TEST_F(XattrTest, GetxattrNonexistentName) {
+TEST_F(XattrTest, GetXattrNonexistentName) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
-TEST_F(XattrTest, Listxattr) {
+TEST_F(XattrTest, ListXattr) {
   const char* path = test_file_name_.c_str();
   const std::string name = "user.test";
   const std::string name2 = "user.test2";
@@ -490,7 +493,7 @@ TEST_F(XattrTest, Listxattr) {
   EXPECT_EQ(got, expected);
 }
 
-TEST_F(XattrTest, ListxattrNoXattrs) {
+TEST_F(XattrTest, ListXattrNoXattrs) {
   const char* path = test_file_name_.c_str();
 
   std::vector<char> list, expected;
@@ -498,13 +501,13 @@ TEST_F(XattrTest, ListxattrNoXattrs) {
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(list, expected);
 
-  // Listxattr should succeed if there are no attributes, even if the buffer
+  // ListXattr should succeed if there are no attributes, even if the buffer
   // passed in is a nullptr.
   EXPECT_THAT(listxattr(path, nullptr, sizeof(list)),
               SyscallSucceedsWithValue(0));
 }
 
-TEST_F(XattrTest, ListxattrNullBuffer) {
+TEST_F(XattrTest, ListXattrNullBuffer) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -513,7 +516,7 @@ TEST_F(XattrTest, ListxattrNullBuffer) {
               SyscallFailsWithErrno(EFAULT));
 }
 
-TEST_F(XattrTest, ListxattrSizeTooSmall) {
+TEST_F(XattrTest, ListXattrSizeTooSmall) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -523,7 +526,7 @@ TEST_F(XattrTest, ListxattrSizeTooSmall) {
               SyscallFailsWithErrno(ERANGE));
 }
 
-TEST_F(XattrTest, ListxattrZeroSize) {
+TEST_F(XattrTest, ListXattrZeroSize) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -604,6 +607,83 @@ TEST_F(XattrTest, XattrWithFD) {
   EXPECT_THAT(fremovexattr(fd.get(), name), SyscallSucceeds());
 }
 
+TEST_F(XattrTest, TrustedNamespaceWithCapSysAdmin) {
+  // Trusted namespace not supported in VFS1.
+  SKIP_IF(IsRunningWithVFS1());
+
+  // TODO(b/66162845): Only gVisor tmpfs currently supports trusted namespace.
+  SKIP_IF(IsRunningOnGvisor() &&
+          !ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs(test_file_name_)));
+
+  const char* path = test_file_name_.c_str();
+  const char name[] = "trusted.test";
+
+  // Writing to the trusted.* xattr namespace requires CAP_SYS_ADMIN in the root
+  // user namespace. There's no easy way to check that, other than trying the
+  // operation and seeing what happens. We'll call removexattr because it's
+  // simplest.
+  if (removexattr(path, name) < 0) {
+    SKIP_IF(errno == EPERM);
+    FAIL() << "unexpected errno from removexattr: " << errno;
+  }
+
+  // Set.
+  char val = 'a';
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  // Get.
+  char got = '\0';
+  EXPECT_THAT(getxattr(path, name, &got, size), SyscallSucceedsWithValue(size));
+  EXPECT_EQ(val, got);
+
+  // List.
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  // Remove.
+  EXPECT_THAT(removexattr(path, name), SyscallSucceeds());
+
+  // Get should now return ENODATA.
+  EXPECT_THAT(getxattr(path, name, &got, size), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, TrustedNamespaceWithoutCapSysAdmin) {
+  // Trusted namespace not supported in VFS1.
+  SKIP_IF(IsRunningWithVFS1());
+
+  // TODO(b/66162845): Only gVisor tmpfs currently supports trusted namespace.
+  SKIP_IF(IsRunningOnGvisor() &&
+          !ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs(test_file_name_)));
+
+  // Drop CAP_SYS_ADMIN if we have it.
+  if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))) {
+    EXPECT_NO_ERRNO(SetCapability(CAP_SYS_ADMIN, false));
+  }
+
+  const char* path = test_file_name_.c_str();
+  const char name[] = "trusted.test";
+
+  // Set fails.
+  char val = 'a';
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+              SyscallFailsWithErrno(EPERM));
+
+  // Get fails.
+  char got = '\0';
+  EXPECT_THAT(getxattr(path, name, &got, size), SyscallFailsWithErrno(ENODATA));
+
+  // List still works, but returns no items.
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)), SyscallSucceedsWithValue(0));
+
+  // Remove fails.
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/util/BUILD b/test/util/BUILD
index 2a17c33ee..1b028a477 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_library", "cc_test", "gbenchmark", "gtest", "select_system")
+load("//tools:defs.bzl", "cc_library", "cc_test", "coreutil", "gbenchmark", "gtest", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -46,6 +46,13 @@ cc_library(
 )
 
 cc_library(
+    name = "fuse_util",
+    testonly = 1,
+    srcs = ["fuse_util.cc"],
+    hdrs = ["fuse_util.h"],
+)
+
+cc_library(
     name = "proc_util",
     testonly = 1,
     srcs = ["proc_util.cc"],
@@ -148,6 +155,10 @@ cc_library(
     ],
     hdrs = ["save_util.h"],
     defines = select_system(),
+    deps = [
+        ":logging",
+        "@com_google_absl//absl/types:optional",
+    ],
 )
 
 cc_library(
@@ -247,7 +258,7 @@ cc_library(
     ],
     hdrs = ["test_util.h"],
     defines = select_system(),
-    deps = [
+    deps = coreutil() + [
         ":fs_util",
         ":logging",
         ":posix_error",
diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc
index 5418948fe..b16055dd8 100644
--- a/test/util/fs_util.cc
+++ b/test/util/fs_util.cc
@@ -15,7 +15,11 @@
 #include "test/util/fs_util.h"
 
 #include <dirent.h>
+#ifdef __linux__
+#include <linux/magic.h>
+#endif  // __linux__
 #include <sys/stat.h>
+#include <sys/statfs.h>
 #include <sys/types.h>
 #include <unistd.h>
 
@@ -629,5 +633,35 @@ PosixErrorOr<std::string> ProcessExePath(int pid) {
   return ReadLink(absl::StrCat("/proc/", pid, "/exe"));
 }
 
+#ifdef __linux__
+PosixErrorOr<bool> IsTmpfs(const std::string& path) {
+  struct statfs stat;
+  if (statfs(path.c_str(), &stat)) {
+    if (errno == ENOENT) {
+      // Nothing at path, don't raise this as an error. Instead, just report no
+      // tmpfs at path.
+      return false;
+    }
+    return PosixError(errno,
+                      absl::StrFormat("statfs(\"%s\", %#p)", path, &stat));
+  }
+  return stat.f_type == TMPFS_MAGIC;
+}
+#endif  // __linux__
+
+PosixErrorOr<bool> IsOverlayfs(const std::string& path) {
+  struct statfs stat;
+  if (statfs(path.c_str(), &stat)) {
+    if (errno == ENOENT) {
+      // Nothing at path, don't raise this as an error. Instead, just report no
+      // overlayfs at path.
+      return false;
+    }
+    return PosixError(errno,
+                      absl::StrFormat("statfs(\"%s\", %#p)", path, &stat));
+  }
+  return stat.f_type == OVERLAYFS_SUPER_MAGIC;
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/util/fs_util.h b/test/util/fs_util.h
index 8cdac23a1..c99cf5eb7 100644
--- a/test/util/fs_util.h
+++ b/test/util/fs_util.h
@@ -17,6 +17,7 @@
 
 #include <dirent.h>
 #include <sys/stat.h>
+#include <sys/statfs.h>
 #include <sys/types.h>
 #include <unistd.h>
 
@@ -37,6 +38,10 @@ constexpr int kOLargeFile = 00400000;
 #error "Unknown architecture"
 #endif
 
+// From linux/magic.h. For some reason, not defined in the headers for some
+// build environments.
+#define OVERLAYFS_SUPER_MAGIC 0x794c7630
+
 // Returns a status or the current working directory.
 PosixErrorOr<std::string> GetCWD();
 
@@ -178,6 +183,14 @@ std::string CleanPath(absl::string_view path);
 // Returns the full path to the executable of the given pid or a PosixError.
 PosixErrorOr<std::string> ProcessExePath(int pid);
 
+#ifdef __linux__
+// IsTmpfs returns true if the file at path is backed by tmpfs.
+PosixErrorOr<bool> IsTmpfs(const std::string& path);
+#endif  // __linux__
+
+// IsOverlayfs returns true if the file at path is backed by overlayfs.
+PosixErrorOr<bool> IsOverlayfs(const std::string& path);
+
 namespace internal {
 // Not part of the public API.
 std::string JoinPathImpl(std::initializer_list<absl::string_view> paths);
diff --git a/test/util/fuse_util.cc b/test/util/fuse_util.cc
new file mode 100644
index 000000000..027f8386c
--- /dev/null
+++ b/test/util/fuse_util.cc
@@ -0,0 +1,63 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/util/fuse_util.h"
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <string>
+
+namespace gvisor {
+namespace testing {
+
+// Create a default FuseAttr struct with specified mode, inode, and size.
+fuse_attr DefaultFuseAttr(mode_t mode, uint64_t inode, uint64_t size) {
+  const int time_sec = 1595436289;
+  const int time_nsec = 134150844;
+  return (struct fuse_attr){
+      .ino = inode,
+      .size = size,
+      .blocks = 4,
+      .atime = time_sec,
+      .mtime = time_sec,
+      .ctime = time_sec,
+      .atimensec = time_nsec,
+      .mtimensec = time_nsec,
+      .ctimensec = time_nsec,
+      .mode = mode,
+      .nlink = 2,
+      .uid = 1234,
+      .gid = 4321,
+      .rdev = 12,
+      .blksize = 4096,
+  };
+}
+
+// Create response body with specified mode, nodeID, and size.
+fuse_entry_out DefaultEntryOut(mode_t mode, uint64_t node_id, uint64_t size) {
+  struct fuse_entry_out default_entry_out = {
+      .nodeid = node_id,
+      .generation = 0,
+      .entry_valid = 0,
+      .attr_valid = 0,
+      .entry_valid_nsec = 0,
+      .attr_valid_nsec = 0,
+      .attr = DefaultFuseAttr(mode, node_id, size),
+  };
+  return default_entry_out;
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/util/fuse_util.h b/test/util/fuse_util.h
new file mode 100644
index 000000000..544fe1b38
--- /dev/null
+++ b/test/util/fuse_util.h
@@ -0,0 +1,75 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_UTIL_FUSE_UTIL_H_
+#define GVISOR_TEST_UTIL_FUSE_UTIL_H_
+
+#include <linux/fuse.h>
+#include <sys/uio.h>
+
+#include <string>
+#include <vector>
+
+namespace gvisor {
+namespace testing {
+
+// The fundamental generation function with a single argument. If passed by
+// std::string or std::vector<char>, it will call specialized versions as
+// implemented below.
+template <typename T>
+std::vector<struct iovec> FuseGenerateIovecs(T &first) {
+  return {(struct iovec){.iov_base = &first, .iov_len = sizeof(first)}};
+}
+
+// If an argument is of type std::string, it must be used in read-only scenario.
+// Because we are setting up iovec, which contains the original address of a
+// data structure, we have to drop const qualification. Usually used with
+// variable-length payload data.
+template <typename T = std::string>
+std::vector<struct iovec> FuseGenerateIovecs(std::string &first) {
+  // Pad one byte for null-terminate c-string.
+  return {(struct iovec){.iov_base = const_cast<char *>(first.c_str()),
+                         .iov_len = first.size() + 1}};
+}
+
+// If an argument is of type std::vector<char>, it must be used in write-only
+// scenario and the size of the variable must be greater than or equal to the
+// size of the expected data. Usually used with variable-length payload data.
+template <typename T = std::vector<char>>
+std::vector<struct iovec> FuseGenerateIovecs(std::vector<char> &first) {
+  return {(struct iovec){.iov_base = first.data(), .iov_len = first.size()}};
+}
+
+// A helper function to set up an array of iovec struct for testing purpose.
+// Use variadic class template to generalize different numbers and different
+// types of FUSE structs.
+template <typename T, typename... Types>
+std::vector<struct iovec> FuseGenerateIovecs(T &first, Types &...args) {
+  auto first_iovec = FuseGenerateIovecs(first);
+  auto iovecs = FuseGenerateIovecs(args...);
+  first_iovec.insert(std::end(first_iovec), std::begin(iovecs),
+                     std::end(iovecs));
+  return first_iovec;
+}
+
+// Create a fuse_attr filled with the specified mode and inode.
+fuse_attr DefaultFuseAttr(mode_t mode, uint64_t inode, uint64_t size = 512);
+
+// Return a fuse_entry_out FUSE server response body.
+fuse_entry_out DefaultEntryOut(mode_t mode, uint64_t node_id,
+                               uint64_t size = 512);
+
+}  // namespace testing
+}  // namespace gvisor
+#endif  // GVISOR_TEST_UTIL_FUSE_UTIL_H_
diff --git a/test/util/posix_error.cc b/test/util/posix_error.cc
index cebf7e0ac..deed0c05b 100644
--- a/test/util/posix_error.cc
+++ b/test/util/posix_error.cc
@@ -87,7 +87,7 @@ bool PosixErrorIsMatcherCommonImpl::MatchAndExplain(
     return false;
   }
 
-  if (!message_matcher_.Matches(error.error_message())) {
+  if (!message_matcher_.Matches(error.message())) {
     return false;
   }
 
diff --git a/test/util/posix_error.h b/test/util/posix_error.h
index ad666bce0..b634a7f78 100644
--- a/test/util/posix_error.h
+++ b/test/util/posix_error.h
@@ -26,11 +26,6 @@
 namespace gvisor {
 namespace testing {
 
-class PosixErrorIsMatcherCommonImpl;
-
-template <typename T>
-class PosixErrorOr;
-
 class ABSL_MUST_USE_RESULT PosixError {
  public:
   PosixError() {}
@@ -49,7 +44,8 @@ class ABSL_MUST_USE_RESULT PosixError {
   // PosixErrorOr.
   const PosixError& error() const { return *this; }
 
-  std::string error_message() const { return msg_; }
+  int errno_value() const { return errno_; }
+  std::string message() const { return msg_; }
 
   // ToString produces a full string representation of this posix error
   // including the printable representation of the errno and the error message.
@@ -61,14 +57,8 @@ class ABSL_MUST_USE_RESULT PosixError {
   void IgnoreError() const {}
 
  private:
-  int errno_value() const { return errno_; }
   int errno_ = 0;
   std::string msg_;
-
-  friend class PosixErrorIsMatcherCommonImpl;
-
-  template <typename T>
-  friend class PosixErrorOr;
 };
 
 template <typename T>
@@ -94,15 +84,12 @@ class ABSL_MUST_USE_RESULT PosixErrorOr {
   template <typename U>
   PosixErrorOr& operator=(PosixErrorOr<U> other);
 
-  // Return a reference to the error or NoError().
-  PosixError error() const;
-
-  // Returns this->error().error_message();
-  std::string error_message() const;
-
   // Returns true if this PosixErrorOr contains some T.
   bool ok() const;
 
+  // Return a copy of the contained PosixError or NoError().
+  PosixError error() const;
+
   // Returns a reference to our current value, or CHECK-fails if !this->ok().
   const T& ValueOrDie() const&;
   T& ValueOrDie() &;
@@ -115,7 +102,6 @@ class ABSL_MUST_USE_RESULT PosixErrorOr {
   void IgnoreError() const {}
 
  private:
-  int errno_value() const;
   absl::variant<T, PosixError> value_;
 
   friend class PosixErrorIsMatcherCommonImpl;
@@ -171,16 +157,6 @@ PosixError PosixErrorOr<T>::error() const {
 }
 
 template <typename T>
-int PosixErrorOr<T>::errno_value() const {
-  return error().errno_value();
-}
-
-template <typename T>
-std::string PosixErrorOr<T>::error_message() const {
-  return error().error_message();
-}
-
-template <typename T>
 bool PosixErrorOr<T>::ok() const {
   return absl::holds_alternative<T>(value_);
 }
diff --git a/test/util/pty_util.cc b/test/util/pty_util.cc
index c01f916aa..2cf0bea74 100644
--- a/test/util/pty_util.cc
+++ b/test/util/pty_util.cc
@@ -23,15 +23,15 @@
 namespace gvisor {
 namespace testing {
 
-PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master) {
-  PosixErrorOr<int> n = SlaveID(master);
+PosixErrorOr<FileDescriptor> OpenReplica(const FileDescriptor& master) {
+  PosixErrorOr<int> n = ReplicaID(master);
   if (!n.ok()) {
     return PosixErrorOr<FileDescriptor>(n.error());
   }
   return Open(absl::StrCat("/dev/pts/", n.ValueOrDie()), O_RDWR | O_NONBLOCK);
 }
 
-PosixErrorOr<int> SlaveID(const FileDescriptor& master) {
+PosixErrorOr<int> ReplicaID(const FileDescriptor& master) {
   // Get pty index.
   int n;
   int ret = ioctl(master.get(), TIOCGPTN, &n);
diff --git a/test/util/pty_util.h b/test/util/pty_util.h
index 0722da379..ed7658868 100644
--- a/test/util/pty_util.h
+++ b/test/util/pty_util.h
@@ -21,11 +21,11 @@
 namespace gvisor {
 namespace testing {
 
-// Opens the slave end of the passed master as R/W and nonblocking.
-PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master);
+// Opens the replica end of the passed master as R/W and nonblocking.
+PosixErrorOr<FileDescriptor> OpenReplica(const FileDescriptor& master);
 
-// Get the number of the slave end of the master.
-PosixErrorOr<int> SlaveID(const FileDescriptor& master);
+// Get the number of the replica end of the master.
+PosixErrorOr<int> ReplicaID(const FileDescriptor& master);
 
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/util/save_util.cc b/test/util/save_util.cc
index 384d626f0..59d47e06e 100644
--- a/test/util/save_util.cc
+++ b/test/util/save_util.cc
@@ -21,35 +21,47 @@
 #include <atomic>
 #include <cerrno>
 
-#define GVISOR_COOPERATIVE_SAVE_TEST "GVISOR_COOPERATIVE_SAVE_TEST"
+#include "absl/types/optional.h"
 
 namespace gvisor {
 namespace testing {
 namespace {
 
-enum class CooperativeSaveMode {
-  kUnknown = 0,  // cooperative_save_mode is statically-initialized to 0
-  kAvailable,
-  kNotAvailable,
-};
-
-std::atomic<CooperativeSaveMode> cooperative_save_mode;
-
-bool CooperativeSaveEnabled() {
-  auto mode = cooperative_save_mode.load();
-  if (mode == CooperativeSaveMode::kUnknown) {
-    mode = (getenv(GVISOR_COOPERATIVE_SAVE_TEST) != nullptr)
-               ? CooperativeSaveMode::kAvailable
-               : CooperativeSaveMode::kNotAvailable;
-    cooperative_save_mode.store(mode);
+std::atomic<absl::optional<bool>> cooperative_save_present;
+std::atomic<absl::optional<bool>> random_save_present;
+
+bool CooperativeSavePresent() {
+  auto present = cooperative_save_present.load();
+  if (!present.has_value()) {
+    present = getenv("GVISOR_COOPERATIVE_SAVE_TEST") != nullptr;
+    cooperative_save_present.store(present);
+  }
+  return present.value();
+}
+
+bool RandomSavePresent() {
+  auto present = random_save_present.load();
+  if (!present.has_value()) {
+    present = getenv("GVISOR_RANDOM_SAVE_TEST") != nullptr;
+    random_save_present.store(present);
   }
-  return mode == CooperativeSaveMode::kAvailable;
+  return present.value();
 }
 
 std::atomic<int> save_disable;
 
 }  // namespace
 
+bool IsRunningWithSaveRestore() {
+  return CooperativeSavePresent() || RandomSavePresent();
+}
+
+void MaybeSave() {
+  if (CooperativeSavePresent() && save_disable.load() == 0) {
+    internal::DoCooperativeSave();
+  }
+}
+
 DisableSave::DisableSave() { save_disable++; }
 
 DisableSave::~DisableSave() { reset(); }
@@ -61,11 +73,5 @@ void DisableSave::reset() {
   }
 }
 
-namespace internal {
-bool ShouldSave() {
-  return CooperativeSaveEnabled() && (save_disable.load() == 0);
-}
-}  // namespace internal
-
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/util/save_util.h b/test/util/save_util.h
index bddad6120..e7218ae88 100644
--- a/test/util/save_util.h
+++ b/test/util/save_util.h
@@ -17,9 +17,17 @@
 
 namespace gvisor {
 namespace testing {
-// Disable save prevents saving while the given function executes.
+
+// Returns true if the environment in which the calling process is executing
+// allows the test to be checkpointed and restored during execution.
+bool IsRunningWithSaveRestore();
+
+// May perform a co-operative save cycle.
 //
-// This lasts the duration of the object, unless reset is called.
+// errno is guaranteed to be preserved.
+void MaybeSave();
+
+// Causes MaybeSave to become a no-op until destroyed or reset.
 class DisableSave {
  public:
   DisableSave();
@@ -37,13 +45,13 @@ class DisableSave {
   bool reset_ = false;
 };
 
-// May perform a co-operative save cycle.
+namespace internal {
+
+// Causes a co-operative save cycle to occur.
 //
 // errno is guaranteed to be preserved.
-void MaybeSave();
+void DoCooperativeSave();
 
-namespace internal {
-bool ShouldSave();
 }  // namespace internal
 
 }  // namespace testing
diff --git a/test/util/save_util_linux.cc b/test/util/save_util_linux.cc
index d0aea8e6a..57431b3ea 100644
--- a/test/util/save_util_linux.cc
+++ b/test/util/save_util_linux.cc
@@ -30,20 +30,20 @@
 
 namespace gvisor {
 namespace testing {
-
-void MaybeSave() {
-  if (internal::ShouldSave()) {
-    int orig_errno = errno;
-    // We use it to trigger saving the sentry state
-    // when this syscall is called.
-    // Notice: this needs to be a valid syscall
-    // that is not used in any of the syscall tests.
-    syscall(SYS_TRIGGER_SAVE, nullptr, 0);
-    errno = orig_errno;
-  }
+namespace internal {
+
+void DoCooperativeSave() {
+  int orig_errno = errno;
+  // We use it to trigger saving the sentry state
+  // when this syscall is called.
+  // Notice: this needs to be a valid syscall
+  // that is not used in any of the syscall tests.
+  syscall(SYS_TRIGGER_SAVE, nullptr, 0);
+  errno = orig_errno;
 }
 
+}  // namespace internal
 }  // namespace testing
 }  // namespace gvisor
 
-#endif
+#endif  // __linux__
diff --git a/test/util/save_util_other.cc b/test/util/save_util_other.cc
index 931af2c29..7749ded76 100644
--- a/test/util/save_util_other.cc
+++ b/test/util/save_util_other.cc
@@ -14,13 +14,17 @@
 
 #ifndef __linux__
 
+#include "test/util/logging.h"
+
 namespace gvisor {
 namespace testing {
+namespace internal {
 
-void MaybeSave() {
-  // Saving is never available in a non-linux environment.
+void DoCooperativeSave() {
+  TEST_CHECK_MSG(false, "DoCooperativeSave not implemented");
 }
 
+}  // namespace internal
 }  // namespace testing
 }  // namespace gvisor
 
diff --git a/test/util/test_util_runfiles.cc b/test/util/test_util_runfiles.cc
index 694d21692..7210094eb 100644
--- a/test/util/test_util_runfiles.cc
+++ b/test/util/test_util_runfiles.cc
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef __fuchsia__
-
 #include <iostream>
 #include <string>
 
@@ -46,5 +44,3 @@ std::string RunfilePath(std::string path) {
 
 }  // namespace testing
 }  // namespace gvisor
-
-#endif  // __fuchsia__
diff --git a/test/util/timer_util.cc b/test/util/timer_util.cc
index 43a26b0d3..75cfc4f40 100644
--- a/test/util/timer_util.cc
+++ b/test/util/timer_util.cc
@@ -23,5 +23,23 @@ absl::Time Now(clockid_t id) {
   return absl::TimeFromTimespec(now);
 }
 
+#ifdef __linux__
+
+PosixErrorOr<IntervalTimer> TimerCreate(clockid_t clockid,
+                                        const struct sigevent& sev) {
+  int timerid;
+  int ret = syscall(SYS_timer_create, clockid, &sev, &timerid);
+  if (ret < 0) {
+    return PosixError(errno, "timer_create");
+  }
+  if (ret > 0) {
+    return PosixError(EINVAL, "timer_create should never return positive");
+  }
+  MaybeSave();
+  return IntervalTimer(timerid);
+}
+
+#endif  // __linux__
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/util/timer_util.h b/test/util/timer_util.h
index 31aea4fc6..926e6632f 100644
--- a/test/util/timer_util.h
+++ b/test/util/timer_util.h
@@ -16,6 +16,9 @@
 #define GVISOR_TEST_UTIL_TIMER_UTIL_H_
 
 #include <errno.h>
+#ifdef __linux__
+#include <sys/syscall.h>
+#endif
 #include <sys/time.h>
 
 #include <functional>
@@ -30,6 +33,9 @@
 namespace gvisor {
 namespace testing {
 
+// Returns the current time.
+absl::Time Now(clockid_t id);
+
 // MonotonicTimer is a simple timer that uses a monotonic clock.
 class MonotonicTimer {
  public:
@@ -65,8 +71,92 @@ inline PosixErrorOr<Cleanup> ScopedItimer(int which,
   }));
 }
 
-// Returns the current time.
-absl::Time Now(clockid_t id);
+#ifdef __linux__
+
+// RAII type for a kernel "POSIX" interval timer. (The kernel provides system
+// calls such as timer_create that behave very similarly, but not identically,
+// to those described by timer_create(2); in particular, the kernel does not
+// implement SIGEV_THREAD. glibc builds POSIX-compliant interval timers based on
+// these kernel interval timers.)
+//
+// Compare implementation to FileDescriptor.
+class IntervalTimer {
+ public:
+  IntervalTimer() = default;
+
+  explicit IntervalTimer(int id) { set_id(id); }
+
+  IntervalTimer(IntervalTimer&& orig) : id_(orig.release()) {}
+
+  IntervalTimer& operator=(IntervalTimer&& orig) {
+    if (this == &orig) return *this;
+    reset(orig.release());
+    return *this;
+  }
+
+  IntervalTimer(const IntervalTimer& other) = delete;
+  IntervalTimer& operator=(const IntervalTimer& other) = delete;
+
+  ~IntervalTimer() { reset(); }
+
+  int get() const { return id_; }
+
+  int release() {
+    int const id = id_;
+    id_ = -1;
+    return id;
+  }
+
+  void reset() { reset(-1); }
+
+  void reset(int id) {
+    if (id_ >= 0) {
+      TEST_PCHECK(syscall(SYS_timer_delete, id_) == 0);
+      MaybeSave();
+    }
+    set_id(id);
+  }
+
+  PosixErrorOr<struct itimerspec> Set(
+      int flags, const struct itimerspec& new_value) const {
+    struct itimerspec old_value = {};
+    if (syscall(SYS_timer_settime, id_, flags, &new_value, &old_value) < 0) {
+      return PosixError(errno, "timer_settime");
+    }
+    MaybeSave();
+    return old_value;
+  }
+
+  PosixErrorOr<struct itimerspec> Get() const {
+    struct itimerspec curr_value = {};
+    if (syscall(SYS_timer_gettime, id_, &curr_value) < 0) {
+      return PosixError(errno, "timer_gettime");
+    }
+    MaybeSave();
+    return curr_value;
+  }
+
+  PosixErrorOr<int> Overruns() const {
+    int rv = syscall(SYS_timer_getoverrun, id_);
+    if (rv < 0) {
+      return PosixError(errno, "timer_getoverrun");
+    }
+    MaybeSave();
+    return rv;
+  }
+
+ private:
+  void set_id(int id) { id_ = std::max(id, -1); }
+
+  // Kernel timer_t is int; glibc timer_t is void*.
+  int id_ = -1;
+};
+
+// A wrapper around timer_create(2).
+PosixErrorOr<IntervalTimer> TimerCreate(clockid_t clockid,
+                                        const struct sigevent& sev);
+
+#endif  // __linux__
 
 }  // namespace testing
 }  // namespace gvisor
diff --git a/tools/BUILD b/tools/BUILD
index da83877b1..faf310676 100644
--- a/tools/BUILD
+++ b/tools/BUILD
@@ -5,5 +5,7 @@ package(licenses = ["notice"])
 bzl_library(
     name = "defs_bzl",
     srcs = ["defs.bzl"],
-    visibility = ["//visibility:private"],
+    visibility = [
+        "//:sandbox",
+    ],
 )
diff --git a/tools/bazel.mk b/tools/bazel.mk
index 3e27af7d1..88431ce66 100644
--- a/tools/bazel.mk
+++ b/tools/bazel.mk
@@ -14,11 +14,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Make hacks.
+EMPTY :=
+SPACE := $(EMPTY) $(EMPTY)
+
 # See base Makefile.
 SHELL=/bin/bash -o pipefail
 BRANCH_NAME := $(shell (git branch --show-current 2>/dev/null || \
 			git rev-parse --abbrev-ref HEAD 2>/dev/null) | \
 			xargs -n 1 basename 2>/dev/null)
+BUILD_ROOTS := bazel-bin/ bazel-out/
 
 # Bazel container configuration (see below).
 USER ?= gvisor
@@ -31,6 +36,7 @@ DOCKER_PRIVILEGED ?= --privileged
 BAZEL_CACHE := $(shell readlink -m ~/.cache/bazel/)
 GCLOUD_CONFIG := $(shell readlink -m ~/.config/gcloud/)
 DOCKER_SOCKET := /var/run/docker.sock
+DOCKER_CONFIG := /etc/docker/daemon.json
 
 # Bazel flags.
 BAZEL := bazel $(STARTUP_OPTIONS)
@@ -56,6 +62,9 @@ endif
 # Add docker passthrough options.
 ifneq ($(DOCKER_PRIVILEGED),)
 FULL_DOCKER_RUN_OPTIONS += -v "$(DOCKER_SOCKET):$(DOCKER_SOCKET)"
+# TODO(gvisor.dev/issue/1624): Remove docker config volume. This is required
+# temporarily for checking VFS1 vs VFS2 by some tests.
+FULL_DOCKER_RUN_OPTIONS += -v "$(DOCKER_CONFIG):$(DOCKER_CONFIG)"
 FULL_DOCKER_RUN_OPTIONS += $(DOCKER_PRIVILEGED)
 FULL_DOCKER_EXEC_OPTIONS += $(DOCKER_PRIVILEGED)
 DOCKER_GROUP := $(shell stat -c '%g' $(DOCKER_SOCKET))
@@ -127,7 +136,7 @@ bazel-server-start: bazel-image ## Starts the bazel server.
 		--workdir "$(CURDIR)" \
 		$(FULL_DOCKER_RUN_OPTIONS) \
 		$(BUILDER_IMAGE) \
-		sh -c "tail -f --pid=\$$($(BAZEL) info server_pid)"
+		sh -c "tail -f --pid=\$$($(BAZEL) info server_pid) /dev/null"
 .PHONY: bazel-server-start
 
 bazel-shutdown: ## Shuts down a running bazel server.
@@ -147,9 +156,12 @@ build_cmd = docker exec $(FULL_DOCKER_EXEC_OPTIONS) $(DOCKER_NAME) sh -o pipefai
 
 build_paths = $(build_cmd) 2>&1 \
 		| tee /proc/self/fd/2 \
-		| grep -E "^  bazel-bin/" \
-		| tr -d '\r' \
+		| grep -A1 -E '^Target' \
+		| grep -E '^  ($(subst $(SPACE),|,$(BUILD_ROOTS)))' \
+		| sed "s/ /\n/g" \
+		| strings -n 10 \
 		| awk '{$$1=$$1};1' \
+		| xargs -n 1 -I {} readlink -f "{}" \
 		| xargs -n 1 -I {} sh -c "$(1)"
 
 build: bazel-server
diff --git a/tools/bazeldefs/BUILD b/tools/bazeldefs/BUILD
index 8d4356119..d043caf06 100644
--- a/tools/bazeldefs/BUILD
+++ b/tools/bazeldefs/BUILD
@@ -26,43 +26,6 @@ rbe_platform(
     remote_execution_properties = """
         properties: {
           name: "container-image"
-          value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50"
-        }
-        properties: {
-          name: "dockerAddCapabilities"
-          value: "SYS_ADMIN"
-        }
-        properties: {
-          name: "dockerPrivileged"
-          value: "true"
-        }
-    """,
-)
-
-rbe_toolchain(
-    name = "cc-toolchain-clang-x86_64-default",
-    exec_compatible_with = [],
-    tags = [
-        "manual",
-    ],
-    target_compatible_with = [],
-    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8",
-    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
-)
-
-# Updated versions of the above, compatible with bazel3.
-rbe_platform(
-    name = "rbe_ubuntu1604_bazel3",
-    constraint_values = [
-        "@bazel_tools//platforms:x86_64",
-        "@bazel_tools//platforms:linux",
-        "@bazel_tools//tools/cpp:clang",
-        "@bazel_toolchains_bazel3//constraints:xenial",
-        "@bazel_toolchains_bazel3//constraints/sanitizers:support_msan",
-    ],
-    remote_execution_properties = """
-        properties: {
-          name: "container-image"
           value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:b516a2d69537cb40a7c6a7d92d0008abb29fba8725243772bdaf2c83f1be2272"
         }
         properties: {
@@ -77,13 +40,13 @@ rbe_platform(
 )
 
 rbe_toolchain(
-    name = "cc-toolchain-clang-x86_64-default_bazel3",
+    name = "cc-toolchain-clang-x86_64-default",
     exec_compatible_with = [],
     tags = [
         "manual",
     ],
     target_compatible_with = [],
-    toolchain = "@bazel_toolchains_bazel3//configs/ubuntu16_04_clang/11.0.0/bazel_3.1.0/cc:cc-compiler-k8",
+    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/11.0.0/bazel_3.1.0/cc:cc-compiler-k8",
     toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
 )
 
diff --git a/tools/bazeldefs/cc.bzl b/tools/bazeldefs/cc.bzl
new file mode 100644
index 000000000..7f41a0142
--- /dev/null
+++ b/tools/bazeldefs/cc.bzl
@@ -0,0 +1,43 @@
+"""C++ rules."""
+
+load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", _cc_flags_supplier = "cc_flags_supplier")
+load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test")
+load("@com_github_grpc_grpc//bazel:cc_grpc_library.bzl", _cc_grpc_library = "cc_grpc_library")
+
+cc_library = _cc_library
+cc_flags_supplier = _cc_flags_supplier
+cc_proto_library = _cc_proto_library
+cc_test = _cc_test
+cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
+gtest = "@com_google_googletest//:gtest"
+gbenchmark = "@com_google_benchmark//:benchmark"
+grpcpp = "@com_github_grpc_grpc//:grpc++"
+vdso_linker_option = "-fuse-ld=gold "
+
+def cc_grpc_library(name, **kwargs):
+    _cc_grpc_library(name = name, grpc_only = True, **kwargs)
+
+def cc_binary(name, static = False, **kwargs):
+    """Run cc_binary.
+
+    Args:
+        name: name of the target.
+        static: make a static binary if True
+        **kwargs: the rest of the args.
+    """
+    if static:
+        # How to statically link a c++ program that uses threads, like for gRPC:
+        # https://gcc.gnu.org/legacy-ml/gcc-help/2010-05/msg00029.html
+        if "linkopts" not in kwargs:
+            kwargs["linkopts"] = []
+        kwargs["linkopts"] += [
+            "-static",
+            "-lstdc++",
+            "-Wl,--whole-archive",
+            "-lpthread",
+            "-Wl,--no-whole-archive",
+        ]
+    _cc_binary(
+        name = name,
+        **kwargs
+    )
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index db7f379b8..ba186aace 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -1,35 +1,13 @@
-"""Bazel implementations of standard rules."""
+"""Meta and miscellaneous rules."""
 
-load("@bazel_gazelle//:def.bzl", _gazelle = "gazelle")
 load("@bazel_skylib//rules:build_test.bzl", _build_test = "build_test")
 load("@bazel_skylib//:bzl_library.bzl", _bzl_library = "bzl_library")
-load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", _cc_flags_supplier = "cc_flags_supplier")
-load("@io_bazel_rules_go//go:def.bzl", "GoLibrary", _go_binary = "go_binary", _go_context = "go_context", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_path = "go_path", _go_test = "go_test")
-load("@io_bazel_rules_go//proto:def.bzl", _go_grpc_library = "go_grpc_library", _go_proto_library = "go_proto_library")
-load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test")
-load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar")
-load("@com_github_grpc_grpc//bazel:cc_grpc_library.bzl", _cc_grpc_library = "cc_grpc_library")
 
 build_test = _build_test
 bzl_library = _bzl_library
-cc_library = _cc_library
-cc_flags_supplier = _cc_flags_supplier
-cc_proto_library = _cc_proto_library
-cc_test = _cc_test
-cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
-gazelle = _gazelle
-go_embed_data = _go_embed_data
-go_path = _go_path
-gtest = "@com_google_googletest//:gtest"
-grpcpp = "@com_github_grpc_grpc//:grpc++"
-gbenchmark = "@com_google_benchmark//:benchmark"
 loopback = "//tools/bazeldefs:loopback"
-pkg_deb = _pkg_deb
-pkg_tar = _pkg_tar
-py_binary = native.py_binary
 rbe_platform = native.platform
 rbe_toolchain = native.toolchain
-vdso_linker_option = "-fuse-ld=gold "
 
 def short_path(path):
     return path
@@ -40,128 +18,6 @@ def proto_library(name, has_services = None, **kwargs):
         **kwargs
     )
 
-def cc_grpc_library(name, **kwargs):
-    _cc_grpc_library(name = name, grpc_only = True, **kwargs)
-
-def _go_proto_or_grpc_library(go_library_func, name, **kwargs):
-    deps = [
-        dep.replace("_proto", "_go_proto")
-        for dep in (kwargs.pop("deps", []) or [])
-    ]
-    go_library_func(
-        name = name + "_go_proto",
-        importpath = "gvisor.dev/gvisor/" + native.package_name() + "/" + name + "_go_proto",
-        proto = ":" + name + "_proto",
-        deps = deps,
-        **kwargs
-    )
-
-def go_proto_library(name, **kwargs):
-    _go_proto_or_grpc_library(_go_proto_library, name, **kwargs)
-
-def go_grpc_and_proto_libraries(name, **kwargs):
-    _go_proto_or_grpc_library(_go_grpc_library, name, **kwargs)
-
-def cc_binary(name, static = False, **kwargs):
-    """Run cc_binary.
-
-    Args:
-        name: name of the target.
-        static: make a static binary if True
-        **kwargs: the rest of the args.
-    """
-    if static:
-        # How to statically link a c++ program that uses threads, like for gRPC:
-        # https://gcc.gnu.org/legacy-ml/gcc-help/2010-05/msg00029.html
-        if "linkopts" not in kwargs:
-            kwargs["linkopts"] = []
-        kwargs["linkopts"] += [
-            "-static",
-            "-lstdc++",
-            "-Wl,--whole-archive",
-            "-lpthread",
-            "-Wl,--no-whole-archive",
-        ]
-    _cc_binary(
-        name = name,
-        **kwargs
-    )
-
-def go_binary(name, static = False, pure = False, **kwargs):
-    """Build a go binary.
-
-    Args:
-        name: name of the target.
-        static: build a static binary.
-        pure: build without cgo.
-        **kwargs: rest of the arguments are passed to _go_binary.
-    """
-    if static:
-        kwargs["static"] = "on"
-    if pure:
-        kwargs["pure"] = "on"
-    _go_binary(
-        name = name,
-        **kwargs
-    )
-
-def go_importpath(target):
-    """Returns the importpath for the target."""
-    return target[GoLibrary].importpath
-
-def go_library(name, **kwargs):
-    _go_library(
-        name = name,
-        importpath = "gvisor.dev/gvisor/" + native.package_name(),
-        **kwargs
-    )
-
-def go_test(name, pure = False, library = None, **kwargs):
-    """Build a go test.
-
-    Args:
-        name: name of the output binary.
-        pure: should it be built without cgo.
-        library: the library to embed.
-        **kwargs: rest of the arguments to pass to _go_test.
-    """
-    if pure:
-        kwargs["pure"] = "on"
-    if library:
-        kwargs["embed"] = [library]
-    _go_test(
-        name = name,
-        **kwargs
-    )
-
-def go_rule(rule, implementation, **kwargs):
-    """Wraps a rule definition with Go attributes.
-
-    Args:
-      rule: rule function (typically rule or aspect).
-      implementation: implementation function.
-      **kwargs: other arguments to pass to rule.
-
-    Returns:
-        The result of invoking the rule.
-    """
-    attrs = kwargs.pop("attrs", [])
-    attrs["_go_context_data"] = attr.label(default = "@io_bazel_rules_go//:go_context_data")
-    attrs["_stdlib"] = attr.label(default = "@io_bazel_rules_go//:stdlib")
-    toolchains = kwargs.get("toolchains", []) + ["@io_bazel_rules_go//go:toolchain"]
-    return rule(implementation, attrs = attrs, toolchains = toolchains, **kwargs)
-
-def go_context(ctx):
-    go_ctx = _go_context(ctx)
-    return struct(
-        go = go_ctx.go,
-        env = go_ctx.env,
-        runfiles = depset([go_ctx.go] + go_ctx.sdk.tools + go_ctx.stdlib.libs),
-        goos = go_ctx.sdk.goos,
-        goarch = go_ctx.sdk.goarch,
-        tags = go_ctx.tags,
-    )
-
 def select_arch(amd64 = "amd64", arm64 = "arm64", default = None, **kwargs):
     values = {
         "@bazel_tools//src/conditions:linux_x86_64": amd64,
@@ -179,3 +35,6 @@ def default_installer():
 
 def default_net_util():
     return []  # Nothing needed.
+
+def coreutil():
+    return []  # Nothing needed.
diff --git a/tools/bazeldefs/go.bzl b/tools/bazeldefs/go.bzl
new file mode 100644
index 000000000..d388346a5
--- /dev/null
+++ b/tools/bazeldefs/go.bzl
@@ -0,0 +1,142 @@
+"""Go rules."""
+
+load("@bazel_gazelle//:def.bzl", _gazelle = "gazelle")
+load("@io_bazel_rules_go//go:def.bzl", "GoLibrary", _go_binary = "go_binary", _go_context = "go_context", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_path = "go_path", _go_test = "go_test")
+load("@io_bazel_rules_go//proto:def.bzl", _go_grpc_library = "go_grpc_library", _go_proto_library = "go_proto_library")
+load("//tools/bazeldefs:defs.bzl", "select_arch", "select_system")
+
+gazelle = _gazelle
+go_embed_data = _go_embed_data
+go_path = _go_path
+
+def _go_proto_or_grpc_library(go_library_func, name, **kwargs):
+    deps = [
+        dep.replace("_proto", "_go_proto")
+        for dep in (kwargs.pop("deps", []) or [])
+    ]
+    go_library_func(
+        name = name + "_go_proto",
+        importpath = "gvisor.dev/gvisor/" + native.package_name() + "/" + name + "_go_proto",
+        proto = ":" + name + "_proto",
+        deps = deps,
+        **kwargs
+    )
+
+def go_proto_library(name, **kwargs):
+    _go_proto_or_grpc_library(_go_proto_library, name, **kwargs)
+
+def go_grpc_and_proto_libraries(name, **kwargs):
+    _go_proto_or_grpc_library(_go_grpc_library, name, **kwargs)
+
+def go_binary(name, static = False, pure = False, x_defs = None, **kwargs):
+    """Build a go binary.
+
+    Args:
+        name: name of the target.
+        static: build a static binary.
+        pure: build without cgo.
+        x_defs: additional definitions.
+        **kwargs: rest of the arguments are passed to _go_binary.
+    """
+    if static:
+        kwargs["static"] = "on"
+    if pure:
+        kwargs["pure"] = "on"
+    _go_binary(
+        name = name,
+        x_defs = x_defs,
+        **kwargs
+    )
+
+def go_importpath(target):
+    """Returns the importpath for the target."""
+    return target[GoLibrary].importpath
+
+def go_library(name, **kwargs):
+    _go_library(
+        name = name,
+        importpath = "gvisor.dev/gvisor/" + native.package_name(),
+        **kwargs
+    )
+
+def go_test(name, pure = False, library = None, **kwargs):
+    """Build a go test.
+
+    Args:
+        name: name of the output binary.
+        pure: should it be built without cgo.
+        library: the library to embed.
+        **kwargs: rest of the arguments to pass to _go_test.
+    """
+    if pure:
+        kwargs["pure"] = "on"
+    if library:
+        kwargs["embed"] = [library]
+    _go_test(
+        name = name,
+        **kwargs
+    )
+
+def go_rule(rule, implementation, **kwargs):
+    """Wraps a rule definition with Go attributes.
+
+    Args:
+      rule: rule function (typically rule or aspect).
+      implementation: implementation function.
+      **kwargs: other arguments to pass to rule.
+
+    Returns:
+        The result of invoking the rule.
+    """
+    attrs = kwargs.pop("attrs", dict())
+    attrs["_go_context_data"] = attr.label(default = "@io_bazel_rules_go//:go_context_data")
+    attrs["_stdlib"] = attr.label(default = "@io_bazel_rules_go//:stdlib")
+    toolchains = kwargs.get("toolchains", []) + ["@io_bazel_rules_go//go:toolchain"]
+    return rule(implementation, attrs = attrs, toolchains = toolchains, **kwargs)
+
+def go_test_library(target):
+    if hasattr(target.attr, "embed") and len(target.attr.embed) > 0:
+        return target.attr.embed[0]
+    return None
+
+def go_context(ctx, goos = None, goarch = None, std = False):
+    """Extracts a standard Go context struct.
+
+    Args:
+      ctx: the starlark context (required).
+      goos: the GOOS value.
+      goarch: the GOARCH value.
+      std: ignored.
+
+    Returns:
+      A context Go struct with pointers to Go toolchain components.
+    """
+
+    # We don't change anything for the standard library analysis. All Go files
+    # are available in all instances. Note that this includes the standard
+    # library sources, which are analyzed by nogo.
+    go_ctx = _go_context(ctx)
+    if goos == None:
+        goos = go_ctx.sdk.goos
+    elif goos != go_ctx.sdk.goos:
+        fail("Internal GOOS (%s) doesn't match GoSdk GOOS (%s)." % (goos, go_ctx.sdk.goos))
+    if goarch == None:
+        goarch = go_ctx.sdk.goarch
+    elif goarch != go_ctx.sdk.goarch:
+        fail("Internal GOARCH (%s) doesn't match GoSdk GOARCH (%s)." % (goarch, go_ctx.sdk.goarch))
+    return struct(
+        go = go_ctx.go,
+        env = go_ctx.env,
+        nogo_args = [],
+        stdlib_srcs = go_ctx.sdk.srcs,
+        runfiles = depset([go_ctx.go] + go_ctx.sdk.srcs + go_ctx.sdk.tools + go_ctx.stdlib.libs),
+        goos = go_ctx.sdk.goos,
+        goarch = go_ctx.sdk.goarch,
+        tags = go_ctx.tags,
+    )
+
+def select_goarch():
+    return select_arch(arm64 = "arm64", amd64 = "amd64")
+
+def select_goos():
+    return select_system(linux = "linux")
diff --git a/tools/bazeldefs/pkg.bzl b/tools/bazeldefs/pkg.bzl
new file mode 100644
index 000000000..56317d93f
--- /dev/null
+++ b/tools/bazeldefs/pkg.bzl
@@ -0,0 +1,6 @@
+"""Packaging rules."""
+
+load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar")
+
+pkg_deb = _pkg_deb
+pkg_tar = _pkg_tar
diff --git a/tools/bigquery/BUILD b/tools/bigquery/BUILD
index 5748fb390..1cea9e1c9 100644
--- a/tools/bigquery/BUILD
+++ b/tools/bigquery/BUILD
@@ -6,5 +6,11 @@ go_library(
     name = "bigquery",
     testonly = 1,
     srcs = ["bigquery.go"],
-    deps = ["@com_google_cloud_go_bigquery//:go_default_library"],
+    visibility = [
+        "//:sandbox",
+    ],
+    deps = [
+        "@com_google_cloud_go_bigquery//:go_default_library",
+        "@org_golang_google_api//option:go_default_library",
+    ],
 )
diff --git a/tools/bigquery/bigquery.go b/tools/bigquery/bigquery.go
index 56f0dc5c9..34b270cc0 100644
--- a/tools/bigquery/bigquery.go
+++ b/tools/bigquery/bigquery.go
@@ -25,16 +25,33 @@ import (
 	"time"
 
 	bq "cloud.google.com/go/bigquery"
+	"google.golang.org/api/option"
 )
 
-// Benchmark is the top level structure of recorded benchmark data. BigQuery
+// Suite is the top level structure for a benchmark run. BigQuery
 // will infer the schema from this.
+type Suite struct {
+	Name       string       `bq:"name"`
+	Conditions []*Condition `bq:"conditions"`
+	Benchmarks []*Benchmark `bq:"benchmarks"`
+	Official   bool         `bq:"official"`
+	Timestamp  time.Time    `bq:"timestamp"`
+}
+
+// Benchmark represents an individual benchmark in a suite.
 type Benchmark struct {
-	Name      string    `bq:"name"`
-	Timestamp time.Time `bq:"timestamp"`
-	Official  bool      `bq:"official"`
-	Metric    []*Metric `bq:"metric"`
-	Metadata  *Metadata `bq:"metadata"`
+	Name      string       `bq:"name"`
+	Condition []*Condition `bq:"condition"`
+	Metric    []*Metric    `bq:"metric"`
+}
+
+// Condition represents qualifiers for the benchmark or suite. For example:
+// Get_Pid/1/real_time would have Benchmark Name "Get_Pid" with "1"
+// and "real_time" parameters as conditions. Suite conditions include
+// information such as the CL number and platform name.
+type Condition struct {
+	Name  string `bq:"name"`
+	Value string `bq:"value"`
 }
 
 // Metric holds the actual metric data and unit information for this benchmark.
@@ -44,19 +61,9 @@ type Metric struct {
 	Sample float64 `bq:"sample"`
 }
 
-// Metadata about this benchmark.
-type Metadata struct {
-	CL          string `bq:"changelist"`
-	IterationID string `bq:"iteration_id"`
-	PendingCL   string `bq:"pending_cl"`
-	Workflow    string `bq:"workflow"`
-	Platform    string `bq:"platform"`
-	Gofer       string `bq:"gofer"`
-}
-
 // InitBigQuery initializes a BigQuery dataset/table in the project. If the dataset/table already exists, it is not duplicated.
-func InitBigQuery(ctx context.Context, projectID, datasetID, tableID string) error {
-	client, err := bq.NewClient(ctx, projectID)
+func InitBigQuery(ctx context.Context, projectID, datasetID, tableID string, opts []option.ClientOption) error {
+	client, err := bq.NewClient(ctx, projectID, opts...)
 	if err != nil {
 		return fmt.Errorf("failed to initialize client on project %s: %v", projectID, err)
 	}
@@ -68,7 +75,7 @@ func InitBigQuery(ctx context.Context, projectID, datasetID, tableID string) err
 	}
 
 	table := dataset.Table(tableID)
-	schema, err := bq.InferSchema(Benchmark{})
+	schema, err := bq.InferSchema(Suite{})
 	if err != nil {
 		return fmt.Errorf("failed to infer schema: %v", err)
 	}
@@ -79,6 +86,14 @@ func InitBigQuery(ctx context.Context, projectID, datasetID, tableID string) err
 	return nil
 }
 
+// AddCondition adds a condition to an existing Benchmark.
+func (bm *Benchmark) AddCondition(name, value string) {
+	bm.Condition = append(bm.Condition, &Condition{
+		Name:  name,
+		Value: value,
+	})
+}
+
 // AddMetric adds a metric to an existing Benchmark.
 func (bm *Benchmark) AddMetric(metricName, unit string, sample float64) {
 	m := &Metric{
@@ -90,26 +105,34 @@ func (bm *Benchmark) AddMetric(metricName, unit string, sample float64) {
 }
 
 // NewBenchmark initializes a new benchmark.
-func NewBenchmark(name string, official bool) *Benchmark {
+func NewBenchmark(name string, iters int, official bool) *Benchmark {
 	return &Benchmark{
-		Name:      name,
-		Timestamp: time.Now().UTC(),
-		Official:  official,
-		Metric:    make([]*Metric, 0),
+		Name:   name,
+		Metric: make([]*Metric, 0),
+	}
+}
+
+// NewSuite initializes a new Suite.
+func NewSuite(name string) *Suite {
+	return &Suite{
+		Name:       name,
+		Timestamp:  time.Now().UTC(),
+		Benchmarks: make([]*Benchmark, 0),
+		Conditions: make([]*Condition, 0),
 	}
 }
 
 // SendBenchmarks sends the slice of benchmarks to the BigQuery dataset/table.
-func SendBenchmarks(ctx context.Context, benchmarks []*Benchmark, projectID, datasetID, tableID string) error {
-	client, err := bq.NewClient(ctx, projectID)
+func SendBenchmarks(ctx context.Context, suite *Suite, projectID, datasetID, tableID string, opts []option.ClientOption) error {
+	client, err := bq.NewClient(ctx, projectID, opts...)
 	if err != nil {
-		return fmt.Errorf("Failed to initialize client on project: %s: %v", projectID, err)
+		return fmt.Errorf("failed to initialize client on project: %s: %v", projectID, err)
 	}
 	defer client.Close()
 
 	uploader := client.Dataset(datasetID).Table(tableID).Uploader()
-	if err = uploader.Put(ctx, benchmarks); err != nil {
-		return fmt.Errorf("failed to upload benchmarks to proejct %s, table %s.%s: %v", projectID, datasetID, tableID, err)
+	if err = uploader.Put(ctx, suite); err != nil {
+		return fmt.Errorf("failed to upload benchmarks %s to project %s, table %s.%s: %v", suite.Name, projectID, datasetID, tableID, err)
 	}
 
 	return nil
diff --git a/tools/checkescape/BUILD b/tools/checkescape/BUILD
index b8c3ddf44..8956be621 100644
--- a/tools/checkescape/BUILD
+++ b/tools/checkescape/BUILD
@@ -8,7 +8,6 @@ go_library(
     nogo = False,
     visibility = ["//tools/nogo:__subpackages__"],
     deps = [
-        "//tools/nogo/data",
         "@org_golang_x_tools//go/analysis:go_tool_library",
         "@org_golang_x_tools//go/analysis/passes/buildssa:go_tool_library",
         "@org_golang_x_tools//go/ssa:go_tool_library",
diff --git a/tools/checkescape/checkescape.go b/tools/checkescape/checkescape.go
index f8def4823..e5a7e23c7 100644
--- a/tools/checkescape/checkescape.go
+++ b/tools/checkescape/checkescape.go
@@ -61,20 +61,21 @@ package checkescape
 import (
 	"bufio"
 	"bytes"
+	"flag"
 	"fmt"
 	"go/ast"
 	"go/token"
 	"go/types"
 	"io"
+	"log"
 	"os"
+	"os/exec"
 	"path/filepath"
-	"strconv"
 	"strings"
 
 	"golang.org/x/tools/go/analysis"
 	"golang.org/x/tools/go/analysis/passes/buildssa"
 	"golang.org/x/tools/go/ssa"
-	"gvisor.dev/gvisor/tools/nogo/data"
 )
 
 const (
@@ -91,81 +92,20 @@ const (
 	exempt = "// escapes"
 )
 
-// escapingBuiltins are builtins known to escape.
-//
-// These are lowered at an earlier stage of compilation to explicit function
-// calls, but are not available for recursive analysis.
-var escapingBuiltins = []string{
-	"append",
-	"makemap",
-	"newobject",
-	"mallocgc",
-}
-
-// Analyzer defines the entrypoint.
-var Analyzer = &analysis.Analyzer{
-	Name:      "checkescape",
-	Doc:       "surfaces recursive escape analysis results",
-	Run:       run,
-	Requires:  []*analysis.Analyzer{buildssa.Analyzer},
-	FactTypes: []analysis.Fact{(*packageEscapeFacts)(nil)},
-}
-
-// packageEscapeFacts is the set of all functions in a package, and whether or
-// not they recursively pass escape analysis.
-//
-// All the type names for receivers are encoded in the full key. The key
-// represents the fully qualified package and type name used at link time.
-type packageEscapeFacts struct {
-	Funcs map[string][]Escape
-}
-
-// AFact implements analysis.Fact.AFact.
-func (*packageEscapeFacts) AFact() {}
-
-// CallSite is a single call site.
-//
-// These can be chained.
-type CallSite struct {
-	LocalPos token.Pos
-	Resolved LinePosition
-}
-
-// Escape is a single escape instance.
-type Escape struct {
-	Reason EscapeReason
-	Detail string
-	Chain  []CallSite
-}
-
-// LinePosition is a low-resolution token.Position.
-//
-// This is used to match against possible exemptions placed in the source.
-type LinePosition struct {
-	Filename string
-	Line     int
-}
+var (
+	// Binary is the binary under analysis.
+	//
+	// See Reader, below.
+	binary = flag.String("binary", "", "binary under analysis")
 
-// String implements fmt.Stringer.String.
-func (e *LinePosition) String() string {
-	return fmt.Sprintf("%s:%d", e.Filename, e.Line)
-}
+	// Reader is the input stream.
+	//
+	// This may be set instead of Binary.
+	Reader io.Reader
 
-// String implements fmt.Stringer.String.
-//
-// Note that this string will contain new lines.
-func (e *Escape) String() string {
-	var b bytes.Buffer
-	fmt.Fprintf(&b, "%s", e.Reason.String())
-	for i, cs := range e.Chain {
-		if i == len(e.Chain)-1 {
-			fmt.Fprintf(&b, "\n @ %s → %s", cs.Resolved.String(), e.Detail)
-		} else {
-			fmt.Fprintf(&b, "\n + %s", cs.Resolved.String())
-		}
-	}
-	return b.String()
-}
+	// objdumpTool is the tool used to dump a binary.
+	objdumpTool = flag.String("objdump_tool", "", "tool used to dump a binary")
+)
 
 // EscapeReason is an escape reason.
 //
@@ -173,12 +113,12 @@ func (e *Escape) String() string {
 type EscapeReason int
 
 const (
-	interfaceInvoke EscapeReason = iota
-	unknownPackage
-	allocation
+	allocation EscapeReason = iota
 	builtin
+	interfaceInvoke
 	dynamicCall
 	stackSplit
+	unknownPackage
 	reasonCount // Count for below.
 )
 
@@ -189,17 +129,17 @@ const (
 func (e EscapeReason) String() string {
 	switch e {
 	case interfaceInvoke:
-		return "interface: function invocation via interface"
+		return "interface: call to potentially allocating function"
 	case unknownPackage:
 		return "unknown: no package information available"
 	case allocation:
-		return "heap: call to runtime heap allocation"
+		return "heap: explicit allocation"
 	case builtin:
-		return "builtin: call to runtime builtin"
+		return "builtin: call to potentially allocating builtin"
 	case dynamicCall:
-		return "dynamic: call via dynamic function"
+		return "dynamic: call to potentially allocating function"
 	case stackSplit:
-		return "stack: stack split on function entry"
+		return "stack: possible split on function entry"
 	default:
 		panic(fmt.Sprintf("unknown reason: %d", e))
 	}
@@ -228,52 +168,289 @@ var escapeTypes = func() map[string]EscapeReason {
 	return result
 }()
 
-// EscapeCount counts escapes.
+// escapingBuiltins are builtins known to escape.
+//
+// These are lowered at an earlier stage of compilation to explicit function
+// calls, but are not available for recursive analysis.
+var escapingBuiltins = []string{
+	"append",
+	"makemap",
+	"newobject",
+	"mallocgc",
+}
+
+// packageEscapeFacts is the set of all functions in a package, and whether or
+// not they recursively pass escape analysis.
+//
+// All the type names for receivers are encoded in the full key. The key
+// represents the fully qualified package and type name used at link time.
 //
-// It is used to avoid accumulating too many escapes for the same reason, for
-// the same function. We limit each class to 3 instances (arbitrarily).
-type EscapeCount struct {
-	byReason [reasonCount]uint32
+// Note that each Escapes object is a summary. Local findings may be reported
+// using more detailed information.
+type packageEscapeFacts struct {
+	Funcs map[string]Escapes
+}
+
+// AFact implements analysis.Fact.AFact.
+func (*packageEscapeFacts) AFact() {}
+
+// Analyzer includes specific results.
+var Analyzer = &analysis.Analyzer{
+	Name:      "checkescape",
+	Doc:       "escape analysis checks based on +checkescape annotations",
+	Run:       runSelectEscapes,
+	Requires:  []*analysis.Analyzer{buildssa.Analyzer},
+	FactTypes: []analysis.Fact{(*packageEscapeFacts)(nil)},
+}
+
+// EscapeAnalyzer includes all local escape results.
+var EscapeAnalyzer = &analysis.Analyzer{
+	Name:     "checkescape",
+	Doc:      "complete local escape analysis results (requires Analyzer facts)",
+	Run:      runAllEscapes,
+	Requires: []*analysis.Analyzer{buildssa.Analyzer},
 }
 
-// maxRecordsPerReason is the number of explicit records.
+// LinePosition is a low-resolution token.Position.
 //
-// See EscapeCount (and usage), and Record implementation.
-const maxRecordsPerReason = 5
-
-// Record records the reason or returns false if it should not be added.
-func (ec *EscapeCount) Record(reason EscapeReason) bool {
-	ec.byReason[reason]++
-	if ec.byReason[reason] > maxRecordsPerReason {
-		return false
+// This is used to match against possible exemptions placed in the source.
+type LinePosition struct {
+	Filename string
+	Line     int
+}
+
+// String implements fmt.Stringer.String.
+func (e LinePosition) String() string {
+	return fmt.Sprintf("%s:%d", e.Filename, e.Line)
+}
+
+// Simplified returns the simplified name.
+func (e LinePosition) Simplified() string {
+	return fmt.Sprintf("%s:%d", filepath.Base(e.Filename), e.Line)
+}
+
+// CallSite is a single call site.
+//
+// These can be chained.
+type CallSite struct {
+	LocalPos token.Pos
+	Resolved LinePosition
+}
+
+// IsValid indicates whether the CallSite is valid or not.
+func (cs *CallSite) IsValid() bool {
+	return cs.LocalPos.IsValid()
+}
+
+// Escapes is a collection of escapes.
+//
+// We record at most one escape for each reason, but record the number of
+// escapes that were omitted.
+//
+// This object should be used to summarize all escapes for a single line (local
+// analysis) or a single function (package facts).
+//
+// All fields are exported for gob.
+type Escapes struct {
+	CallSites [reasonCount][]CallSite
+	Details   [reasonCount]string
+	Omitted   [reasonCount]int
+}
+
+// add is called by Add and Merge.
+func (es *Escapes) add(r EscapeReason, detail string, omitted int, callSites ...CallSite) {
+	if es.CallSites[r] != nil {
+		// We will either be replacing the current escape or dropping
+		// the added one. Either way, we increment omitted by the
+		// appropriate amount.
+		es.Omitted[r]++
+		// If the callSites in the other is only a single element, then
+		// we will universally favor this. This provides the cleanest
+		// set of escapes to summarize, and more importantly: if there
+		if len(es.CallSites) == 1 || len(callSites) != 1 {
+			return
+		}
+	}
+	es.Details[r] = detail
+	es.CallSites[r] = callSites
+	es.Omitted[r] += omitted
+}
+
+// Add adds a single escape.
+func (es *Escapes) Add(r EscapeReason, detail string, callSites ...CallSite) {
+	es.add(r, detail, 0, callSites...)
+}
+
+// IsEmpty returns true iff this Escapes is empty.
+func (es *Escapes) IsEmpty() bool {
+	for _, cs := range es.CallSites {
+		if cs != nil {
+			return false
+		}
 	}
 	return true
 }
 
+// Filter filters out all escapes except those matches the given reasons.
+//
+// If local is set, then non-local escapes will also be filtered.
+func (es *Escapes) Filter(reasons []EscapeReason, local bool) {
+FilterReasons:
+	for r := EscapeReason(0); r < reasonCount; r++ {
+		for i := 0; i < len(reasons); i++ {
+			if r == reasons[i] {
+				continue FilterReasons
+			}
+		}
+		// Zap this reason.
+		es.CallSites[r] = nil
+		es.Details[r] = ""
+		es.Omitted[r] = 0
+	}
+	if !local {
+		return
+	}
+	for r := EscapeReason(0); r < reasonCount; r++ {
+		// Is does meet our local requirement?
+		if len(es.CallSites[r]) > 1 {
+			es.CallSites[r] = nil
+			es.Details[r] = ""
+			es.Omitted[r] = 0
+		}
+	}
+}
+
+// MergeWithCall merges these escapes with another.
+//
+// If callSite is nil, no call is added.
+func (es *Escapes) MergeWithCall(other Escapes, callSite CallSite) {
+	for r := EscapeReason(0); r < reasonCount; r++ {
+		if other.CallSites[r] != nil {
+			// Construct our new call chain.
+			newCallSites := other.CallSites[r]
+			if callSite.IsValid() {
+				newCallSites = append([]CallSite{callSite}, newCallSites...)
+			}
+			// Add (potentially replacing) the underlying escape.
+			es.add(r, other.Details[r], other.Omitted[r], newCallSites...)
+		}
+	}
+}
+
+// Reportf will call Reportf for each class of escapes.
+func (es *Escapes) Reportf(pass *analysis.Pass) {
+	var b bytes.Buffer // Reused for all escapes.
+	for r := EscapeReason(0); r < reasonCount; r++ {
+		if es.CallSites[r] == nil {
+			continue
+		}
+		b.Reset()
+		fmt.Fprintf(&b, "%s ", r.String())
+		if es.Omitted[r] > 0 {
+			fmt.Fprintf(&b, "(%d omitted) ", es.Omitted[r])
+		}
+		for _, cs := range es.CallSites[r][1:] {
+			fmt.Fprintf(&b, "→ %s ", cs.Resolved.String())
+		}
+		fmt.Fprintf(&b, "→ %s", es.Details[r])
+		pass.Reportf(es.CallSites[r][0].LocalPos, b.String())
+	}
+}
+
+// MergeAll merges a sequence of escapes.
+func MergeAll(others []Escapes) (es Escapes) {
+	for _, other := range others {
+		es.MergeWithCall(other, CallSite{})
+	}
+	return
+}
+
 // loadObjdump reads the objdump output.
 //
 // This records if there is a call any function for every source line. It is
 // used only to remove false positives for escape analysis. The call will be
 // elided if escape analysis is able to put the object on the heap exclusively.
-func loadObjdump() (map[LinePosition]string, error) {
-	f, err := os.Open(data.Objdump)
+//
+// Note that the map uses <basename.go>:<line> because that is all that is
+// provided in the objdump format. Since this is all local, it is sufficient.
+func loadObjdump() (map[string][]string, error) {
+	var (
+		args  []string
+		stdin io.Reader
+	)
+	if *binary != "" {
+		args = append(args, *binary)
+	} else if Reader != nil {
+		stdin = Reader
+	} else {
+		// We have no input stream or binary.
+		return nil, fmt.Errorf("no binary or reader provided")
+	}
+
+	// Construct our command.
+	cmd := exec.Command(*objdumpTool, args...)
+	cmd.Stdin = stdin
+	cmd.Stderr = os.Stderr
+	out, err := cmd.StdoutPipe()
 	if err != nil {
 		return nil, err
 	}
-	defer f.Close()
+	if err := cmd.Start(); err != nil {
+		return nil, err
+	}
+
+	// Identify calls by address or name. Note that this is also
+	// constructed dynamically below, as we encounted the addresses.
+	// This is because some of the functions (duffzero) may have
+	// jump targets in the middle of the function itself.
+	funcsAllowed := map[string]struct{}{
+		"runtime.duffzero":       struct{}{},
+		"runtime.duffcopy":       struct{}{},
+		"runtime.racefuncenter":  struct{}{},
+		"runtime.gcWriteBarrier": struct{}{},
+		"runtime.retpolineAX":    struct{}{},
+		"runtime.retpolineBP":    struct{}{},
+		"runtime.retpolineBX":    struct{}{},
+		"runtime.retpolineCX":    struct{}{},
+		"runtime.retpolineDI":    struct{}{},
+		"runtime.retpolineDX":    struct{}{},
+		"runtime.retpolineR10":   struct{}{},
+		"runtime.retpolineR11":   struct{}{},
+		"runtime.retpolineR12":   struct{}{},
+		"runtime.retpolineR13":   struct{}{},
+		"runtime.retpolineR14":   struct{}{},
+		"runtime.retpolineR15":   struct{}{},
+		"runtime.retpolineR8":    struct{}{},
+		"runtime.retpolineR9":    struct{}{},
+		"runtime.retpolineSI":    struct{}{},
+		"runtime.stackcheck":     struct{}{},
+		"runtime.settls":         struct{}{},
+	}
+	addrsAllowed := make(map[string]struct{})
 
 	// Build the map.
-	m := make(map[LinePosition]string)
-	r := bufio.NewReader(f)
-	var (
-		lastField string
-		lastPos   LinePosition
-	)
+	nextFunc := "" // For funcsAllowed.
+	m := make(map[string][]string)
+	r := bufio.NewReader(out)
+NextLine:
 	for {
 		line, err := r.ReadString('\n')
 		if err != nil && err != io.EOF {
 			return nil, err
 		}
+		fields := strings.Fields(line)
+
+		// Is this an "allowed" function definition?
+		if len(fields) >= 2 && fields[0] == "TEXT" {
+			nextFunc = strings.TrimSuffix(fields[1], "(SB)")
+			if _, ok := funcsAllowed[nextFunc]; !ok {
+				nextFunc = "" // Don't record addresses.
+			}
+		}
+		if nextFunc != "" && len(fields) > 2 {
+			// Save the given address (in hex form, as it appears).
+			addrsAllowed[fields[1]] = struct{}{}
+		}
 
 		// We recognize lines corresponding to actual code (not the
 		// symbol name or other metadata) and annotate them if they
@@ -283,53 +460,70 @@ func loadObjdump() (map[LinePosition]string, error) {
 		//
 		// Lines look like this (including the first space):
 		//  gohacks_unsafe.go:33  0xa39                   488b442408              MOVQ 0x8(SP), AX
-		if len(line) > 0 && line[0] == ' ' {
-			fields := strings.Fields(line)
+		if len(fields) >= 5 && line[0] == ' ' {
 			if !strings.Contains(fields[3], "CALL") {
 				continue
 			}
+			site := fields[0]
+			target := strings.TrimSuffix(fields[4], "(SB)")
 
-			// Ignore strings containing duffzero, which is just
-			// used by stack allocations for types that are large
-			// enough to warrant Duff's device.
-			if strings.Contains(line, "runtime.duffzero") {
+			// Ignore strings containing allowed functions.
+			if _, ok := funcsAllowed[target]; ok {
 				continue
 			}
-
-			// Ignore the racefuncenter call, which is used for
-			// race builds. This does not escape.
-			if strings.Contains(line, "runtime.racefuncenter") {
+			if _, ok := addrsAllowed[target]; ok {
 				continue
 			}
-
-			// Calculate the filename and line. Note that per the
-			// example above, the filename is not a fully qualified
-			// base, just the basename (what we require).
-			if fields[0] != lastField {
-				parts := strings.SplitN(fields[0], ":", 2)
-				lineNum, err := strconv.ParseInt(parts[1], 10, 64)
-				if err != nil {
-					return nil, err
-				}
-				lastPos = LinePosition{
-					Filename: parts[0],
-					Line:     int(lineNum),
+			if len(fields) > 5 {
+				// This may be a future relocation. Some
+				// objdump versions describe this differently.
+				// If it contains any of the functions allowed
+				// above as a string, we let it go.
+				softTarget := strings.Join(fields[5:], " ")
+				for name := range funcsAllowed {
+					if strings.Contains(softTarget, name) {
+						continue NextLine
+					}
 				}
-				lastField = fields[0]
-			}
-			if _, ok := m[lastPos]; ok {
-				continue // Already marked.
 			}
 
-			// Save the actual call for the detail.
-			m[lastPos] = strings.Join(fields[3:], " ")
+			// Does this exist already?
+			existing, ok := m[site]
+			if !ok {
+				existing = make([]string, 0, 1)
+			}
+			for _, other := range existing {
+				if target == other {
+					continue NextLine
+				}
+			}
+			existing = append(existing, target)
+			m[site] = existing // Update.
 		}
 		if err == io.EOF {
 			break
 		}
 	}
 
-	return m, nil
+	// Zap any accidental false positives.
+	final := make(map[string][]string)
+	for site, calls := range m {
+		filteredCalls := make([]string, 0, len(calls))
+		for _, call := range calls {
+			if _, ok := addrsAllowed[call]; ok {
+				continue // Omit this call.
+			}
+			filteredCalls = append(filteredCalls, call)
+		}
+		final[site] = filteredCalls
+	}
+
+	// Wait for the dump to finish.
+	if err := cmd.Wait(); err != nil {
+		return nil, err
+	}
+
+	return final, nil
 }
 
 // poser is a type that implements Pos.
@@ -337,65 +531,156 @@ type poser interface {
 	Pos() token.Pos
 }
 
+// runSelectEscapes runs with only select escapes.
+func runSelectEscapes(pass *analysis.Pass) (interface{}, error) {
+	return run(pass, false)
+}
+
+// runAllEscapes runs with all escapes included.
+func runAllEscapes(pass *analysis.Pass) (interface{}, error) {
+	return run(pass, true)
+}
+
+// findReasons extracts reasons from the function.
+func findReasons(pass *analysis.Pass, fdecl *ast.FuncDecl) ([]EscapeReason, bool, map[EscapeReason]bool) {
+	// Is there a comment?
+	if fdecl.Doc == nil {
+		return nil, false, nil
+	}
+	var (
+		reasons     []EscapeReason
+		local       bool
+		testReasons = make(map[EscapeReason]bool) // reason -> local?
+	)
+	// Scan all lines.
+	found := false
+	for _, c := range fdecl.Doc.List {
+		// Does the comment contain a +checkescape line?
+		if !strings.HasPrefix(c.Text, magic) && !strings.HasPrefix(c.Text, testMagic) {
+			continue
+		}
+		if c.Text == magic {
+			// Default: hard reasons, local only.
+			reasons = hardReasons
+			local = true
+		} else if strings.HasPrefix(c.Text, magicParams) {
+			// Extract specific reasons.
+			types := strings.Split(c.Text[len(magicParams):], ",")
+			found = true // For below.
+			for i := 0; i < len(types); i++ {
+				if types[i] == "local" {
+					// Limit search to local escapes.
+					local = true
+				} else if types[i] == "all" {
+					// Append all reasons.
+					reasons = append(reasons, allReasons...)
+				} else if types[i] == "hard" {
+					// Append all hard reasons.
+					reasons = append(reasons, hardReasons...)
+				} else {
+					r, ok := escapeTypes[types[i]]
+					if !ok {
+						// This is not a valid escape reason.
+						pass.Reportf(fdecl.Pos(), "unknown reason: %v", types[i])
+						continue
+					}
+					reasons = append(reasons, r)
+				}
+			}
+		} else if strings.HasPrefix(c.Text, testMagic) {
+			types := strings.Split(c.Text[len(testMagic):], ",")
+			local := false
+			for i := 0; i < len(types); i++ {
+				if types[i] == "local" {
+					local = true
+				} else {
+					r, ok := escapeTypes[types[i]]
+					if !ok {
+						// This is not a valid escape reason.
+						pass.Reportf(fdecl.Pos(), "unknown reason: %v", types[i])
+						continue
+					}
+					if v, ok := testReasons[r]; ok && v {
+						// Already registered as local.
+						continue
+					}
+					testReasons[r] = local
+				}
+			}
+		}
+	}
+	if len(reasons) == 0 && found {
+		// A magic annotation was provided, but no reasons.
+		pass.Reportf(fdecl.Pos(), "no reasons provided")
+	}
+	return reasons, local, testReasons
+}
+
 // run performs the analysis.
-func run(pass *analysis.Pass) (interface{}, error) {
+func run(pass *analysis.Pass, localEscapes bool) (interface{}, error) {
 	calls, err := loadObjdump()
 	if err != nil {
-		return nil, err
-	}
-	pef := packageEscapeFacts{
-		Funcs: make(map[string][]Escape),
+		// Note that if this analysis fails, then we don't actually
+		// fail the analyzer itself. We simply report every possible
+		// escape. In most cases this will work just fine.
+		log.Printf("WARNING: unable to load objdump: %v", err)
 	}
+	allEscapes := make(map[string][]Escapes)
+	mergedEscapes := make(map[string]Escapes)
 	linePosition := func(inst, parent poser) LinePosition {
 		p := pass.Fset.Position(inst.Pos())
 		if (p.Filename == "" || p.Line == 0) && parent != nil {
 			p = pass.Fset.Position(parent.Pos())
 		}
 		return LinePosition{
-			Filename: filepath.Base(p.Filename),
+			Filename: p.Filename,
 			Line:     p.Line,
 		}
 	}
-	hasCall := func(inst poser) (string, bool) {
-		p := linePosition(inst, nil)
-		s, ok := calls[p]
-		return s, ok
-	}
 	callSite := func(inst ssa.Instruction) CallSite {
 		return CallSite{
 			LocalPos: inst.Pos(),
 			Resolved: linePosition(inst, inst.Parent()),
 		}
 	}
-	escapes := func(reason EscapeReason, detail string, inst ssa.Instruction, ec *EscapeCount) []Escape {
-		if !ec.Record(reason) {
-			return nil // Skip.
+	hasCall := func(inst poser) (string, bool) {
+		p := linePosition(inst, nil)
+		if calls == nil {
+			// See above: we don't have access to the binary
+			// itself, so need to include every possible call.
+			return "(possible)", true
 		}
-		es := Escape{
-			Reason: reason,
-			Detail: detail,
-			Chain:  []CallSite{callSite(inst)},
+		s, ok := calls[p.Simplified()]
+		if !ok {
+			return "", false
 		}
-		return []Escape{es}
+		// Join all calls together.
+		return strings.Join(s, " or "), true
 	}
-	resolve := func(sub []Escape, inst ssa.Instruction, ec *EscapeCount) (es []Escape) {
-		for _, e := range sub {
-			if !ec.Record(e.Reason) {
-				continue // Skip.
+	state := pass.ResultOf[buildssa.Analyzer].(*buildssa.SSA)
+
+	// Build the exception list.
+	exemptions := make(map[LinePosition]string)
+	for _, f := range pass.Files {
+		for _, cg := range f.Comments {
+			for _, c := range cg.List {
+				p := pass.Fset.Position(c.Slash)
+				if strings.HasPrefix(strings.ToLower(c.Text), exempt) {
+					exemptions[LinePosition{
+						Filename: p.Filename,
+						Line:     p.Line,
+					}] = c.Text[len(exempt):]
+				}
 			}
-			es = append(es, Escape{
-				Reason: e.Reason,
-				Detail: e.Detail,
-				Chain:  append([]CallSite{callSite(inst)}, e.Chain...),
-			})
 		}
-		return es
 	}
-	state := pass.ResultOf[buildssa.Analyzer].(*buildssa.SSA)
 
-	var loadFunc func(*ssa.Function) []Escape // Used below.
-
-	analyzeInstruction := func(inst ssa.Instruction, ec *EscapeCount) []Escape {
+	var loadFunc func(*ssa.Function) Escapes // Used below.
+	analyzeInstruction := func(inst ssa.Instruction) (es Escapes) {
+		cs := callSite(inst)
+		if _, ok := exemptions[cs.Resolved]; ok {
+			return // No escape.
+		}
 		switch x := inst.(type) {
 		case *ssa.Call:
 			if x.Call.IsInvoke() {
@@ -404,19 +689,15 @@ func run(pass *analysis.Pass) (interface{}, error) {
 				// not, since we don't know the underlying
 				// type.
 				call, _ := hasCall(inst)
-				return escapes(interfaceInvoke, call, inst, ec)
+				es.Add(interfaceInvoke, call, cs)
+				return
 			}
 			switch x := x.Call.Value.(type) {
 			case *ssa.Function:
 				if x.Pkg == nil {
 					// Can't resolve the package.
-					return escapes(unknownPackage, "no package", inst, ec)
-				}
-
-				// Atomic functions are instrinics. We can
-				// assume that they don't escape.
-				if x.Pkg.Pkg.Name() == "atomic" {
-					return nil
+					es.Add(unknownPackage, "no package", cs)
+					return
 				}
 
 				// Is this a local function? If yes, call the
@@ -424,7 +705,8 @@ func run(pass *analysis.Pass) (interface{}, error) {
 				// local escapes are the escapes found in the
 				// local function.
 				if x.Pkg.Pkg == pass.Pkg {
-					return resolve(loadFunc(x), inst, ec)
+					es.MergeWithCall(loadFunc(x), cs)
+					return
 				}
 
 				// Recursively collect information from
@@ -433,22 +715,26 @@ func run(pass *analysis.Pass) (interface{}, error) {
 				if !pass.ImportPackageFact(x.Pkg.Pkg, &imp) {
 					// Unable to import the dependency; we must
 					// declare these as escaping.
-					return escapes(unknownPackage, "no analysis", inst, ec)
+					es.Add(unknownPackage, "no analysis", cs)
+					return
 				}
 
 				// The escapes of this instruction are the
 				// escapes of the called function directly.
-				return resolve(imp.Funcs[x.RelString(x.Pkg.Pkg)], inst, ec)
+				// Note that this may record many escapes.
+				es.MergeWithCall(imp.Funcs[x.RelString(x.Pkg.Pkg)], cs)
+				return
 			case *ssa.Builtin:
 				// Ignore elided escapes.
 				if _, has := hasCall(inst); !has {
-					return nil
+					return
 				}
 
 				// Check if the builtin is escaping.
 				for _, name := range escapingBuiltins {
 					if x.Name() == name {
-						return escapes(builtin, name, inst, ec)
+						es.Add(builtin, name, cs)
+						return
 					}
 				}
 			default:
@@ -457,82 +743,87 @@ func run(pass *analysis.Pass) (interface{}, error) {
 				// dispatches. We cannot actually look up what
 				// this refers to using static analysis alone.
 				call, _ := hasCall(inst)
-				return escapes(dynamicCall, call, inst, ec)
+				es.Add(dynamicCall, call, cs)
 			}
 		case *ssa.Alloc:
 			// Ignore non-heap allocations.
 			if !x.Heap {
-				return nil
+				return
 			}
 
 			// Ignore elided escapes.
 			call, has := hasCall(inst)
 			if !has {
-				return nil
+				return
 			}
 
 			// This is a real heap allocation.
-			return escapes(allocation, call, inst, ec)
+			es.Add(allocation, call, cs)
 		case *ssa.MakeMap:
-			return escapes(builtin, "makemap", inst, ec)
+			es.Add(builtin, "makemap", cs)
 		case *ssa.MakeSlice:
-			return escapes(builtin, "makeslice", inst, ec)
+			es.Add(builtin, "makeslice", cs)
 		case *ssa.MakeClosure:
-			return escapes(builtin, "makeclosure", inst, ec)
+			es.Add(builtin, "makeclosure", cs)
 		case *ssa.MakeChan:
-			return escapes(builtin, "makechan", inst, ec)
+			es.Add(builtin, "makechan", cs)
 		}
-		return nil // No escapes.
+		return
 	}
 
-	var analyzeBasicBlock func(*ssa.BasicBlock, *EscapeCount) []Escape // Recursive.
-	analyzeBasicBlock = func(block *ssa.BasicBlock, ec *EscapeCount) (rval []Escape) {
+	var analyzeBasicBlock func(*ssa.BasicBlock) []Escapes // Recursive.
+	analyzeBasicBlock = func(block *ssa.BasicBlock) (rval []Escapes) {
 		for _, inst := range block.Instrs {
-			rval = append(rval, analyzeInstruction(inst, ec)...)
+			if es := analyzeInstruction(inst); !es.IsEmpty() {
+				rval = append(rval, es)
+			}
 		}
-		return rval // N.B. may be empty.
+		return
 	}
 
-	loadFunc = func(fn *ssa.Function) []Escape {
+	loadFunc = func(fn *ssa.Function) Escapes {
 		// Is this already available?
 		name := fn.RelString(pass.Pkg)
-		if es, ok := pef.Funcs[name]; ok {
+		if es, ok := mergedEscapes[name]; ok {
 			return es
 		}
 
 		// In the case of a true cycle, we assume that the current
-		// function itself has no escapes until the rest of the
-		// analysis is complete. This will trip the above in the case
-		// of a cycle of any kind.
-		pef.Funcs[name] = nil
+		// function itself has no escapes.
+		//
+		// When evaluating the function again, the proper escapes will
+		// be filled in here.
+		allEscapes[name] = nil
+		mergedEscapes[name] = Escapes{}
 
 		// Perform the basic analysis.
-		var (
-			es []Escape
-			ec EscapeCount
-		)
+		var es []Escapes
 		if fn.Recover != nil {
-			es = append(es, analyzeBasicBlock(fn.Recover, &ec)...)
+			es = append(es, analyzeBasicBlock(fn.Recover)...)
 		}
 		for _, block := range fn.Blocks {
-			es = append(es, analyzeBasicBlock(block, &ec)...)
+			es = append(es, analyzeBasicBlock(block)...)
 		}
 
 		// Check for a stack split.
 		if call, has := hasCall(fn); has {
-			es = append(es, Escape{
-				Reason: stackSplit,
-				Detail: call,
-				Chain: []CallSite{CallSite{
-					LocalPos: fn.Pos(),
-					Resolved: linePosition(fn, fn.Parent()),
-				}},
+			var ss Escapes
+			ss.Add(stackSplit, call, CallSite{
+				LocalPos: fn.Pos(),
+				Resolved: linePosition(fn, fn.Parent()),
 			})
+			es = append(es, ss)
 		}
 
 		// Save the result and return.
-		pef.Funcs[name] = es
-		return es
+		//
+		// Note that we merge the result when saving to the facts. It
+		// doesn't really matter the specific escapes, as long as we
+		// have recorded all the appropriate classes of escapes.
+		summary := MergeAll(es)
+		allEscapes[name] = es
+		mergedEscapes[name] = summary
+		return summary
 	}
 
 	// Complete all local functions.
@@ -540,173 +831,76 @@ func run(pass *analysis.Pass) (interface{}, error) {
 		loadFunc(fn)
 	}
 
-	// Build the exception list.
-	exemptions := make(map[LinePosition]string)
-	for _, f := range pass.Files {
-		for _, cg := range f.Comments {
-			for _, c := range cg.List {
-				p := pass.Fset.Position(c.Slash)
-				if strings.HasPrefix(strings.ToLower(c.Text), exempt) {
-					exemptions[LinePosition{
-						Filename: filepath.Base(p.Filename),
-						Line:     p.Line,
-					}] = c.Text[len(exempt):]
-				}
-			}
-		}
+	if !localEscapes {
+		// Export all findings for future packages. We only do this in
+		// non-local escapes mode, and expect to run this analysis
+		// after the SelectAnalysis.
+		pass.ExportPackageFact(&packageEscapeFacts{
+			Funcs: mergedEscapes,
+		})
 	}
 
-	// Delete everything matching the excemtions.
-	//
-	// This has the implication that exceptions are applied recursively,
-	// since this now modified set is what will be saved.
-	for name, escapes := range pef.Funcs {
-		var newEscapes []Escape
-		for _, escape := range escapes {
-			isExempt := false
-			for line, _ := range exemptions {
-				// Note that an exemption applies if it is
-				// marked as an exemption anywhere in the call
-				// chain. It need not be marked as escapes in
-				// the function itself, nor in the top-level
-				// caller.
-				for _, callSite := range escape.Chain {
-					if callSite.Resolved == line {
-						isExempt = true
-						break
-					}
-				}
-				if isExempt {
-					break
-				}
-			}
-			if !isExempt {
-				// Record this escape; not an exception.
-				newEscapes = append(newEscapes, escape)
-			}
-		}
-		pef.Funcs[name] = newEscapes // Update.
-	}
-
-	// Export all findings for future packages.
-	pass.ExportPackageFact(&pef)
-
 	// Scan all functions for violations.
 	for _, f := range pass.Files {
 		// Scan all declarations.
 		for _, decl := range f.Decls {
-			fdecl, ok := decl.(*ast.FuncDecl)
 			// Function declaration?
+			fdecl, ok := decl.(*ast.FuncDecl)
 			if !ok {
 				continue
 			}
-			// Is there a comment?
-			if fdecl.Doc == nil {
-				continue
-			}
 			var (
 				reasons     []EscapeReason
-				found       bool
 				local       bool
-				testReasons = make(map[EscapeReason]bool) // reason -> local?
+				testReasons map[EscapeReason]bool
 			)
-			// Does the comment contain a +checkescape line?
-			for _, c := range fdecl.Doc.List {
-				if !strings.HasPrefix(c.Text, magic) && !strings.HasPrefix(c.Text, testMagic) {
-					continue
-				}
-				if c.Text == magic {
-					// Default: hard reasons, local only.
-					reasons = hardReasons
-					local = true
-				} else if strings.HasPrefix(c.Text, magicParams) {
-					// Extract specific reasons.
-					types := strings.Split(c.Text[len(magicParams):], ",")
-					found = true // For below.
-					for i := 0; i < len(types); i++ {
-						if types[i] == "local" {
-							// Limit search to local escapes.
-							local = true
-						} else if types[i] == "all" {
-							// Append all reasons.
-							reasons = append(reasons, allReasons...)
-						} else if types[i] == "hard" {
-							// Append all hard reasons.
-							reasons = append(reasons, hardReasons...)
-						} else {
-							r, ok := escapeTypes[types[i]]
-							if !ok {
-								// This is not a valid escape reason.
-								pass.Reportf(fdecl.Pos(), "unknown reason: %v", types[i])
-								continue
-							}
-							reasons = append(reasons, r)
-						}
-					}
-				} else if strings.HasPrefix(c.Text, testMagic) {
-					types := strings.Split(c.Text[len(testMagic):], ",")
-					local := false
-					for i := 0; i < len(types); i++ {
-						if types[i] == "local" {
-							local = true
-						} else {
-							r, ok := escapeTypes[types[i]]
-							if !ok {
-								// This is not a valid escape reason.
-								pass.Reportf(fdecl.Pos(), "unknown reason: %v", types[i])
-								continue
-							}
-							if v, ok := testReasons[r]; ok && v {
-								// Already registered as local.
-								continue
-							}
-							testReasons[r] = local
-						}
-					}
-				}
-			}
-			if len(reasons) == 0 && found {
-				// A magic annotation was provided, but no reasons.
-				pass.Reportf(fdecl.Pos(), "no reasons provided")
-				continue
+			if localEscapes {
+				// Find all hard escapes.
+				reasons = hardReasons
+			} else {
+				// Find all declared reasons.
+				reasons, local, testReasons = findReasons(pass, fdecl)
 			}
 
 			// Scan for matches.
 			fn := pass.TypesInfo.Defs[fdecl.Name].(*types.Func)
-			name := state.Pkg.Prog.FuncValue(fn).RelString(pass.Pkg)
-			es, ok := pef.Funcs[name]
-			if !ok {
+			fv := state.Pkg.Prog.FuncValue(fn)
+			if fv == nil {
+				continue
+			}
+			name := fv.RelString(pass.Pkg)
+			all, allOk := allEscapes[name]
+			merged, mergedOk := mergedEscapes[name]
+			if !allOk || !mergedOk {
 				pass.Reportf(fdecl.Pos(), "internal error: function %s not found.", name)
 				continue
 			}
-			for _, e := range es {
-				for _, r := range reasons {
-					// Is does meet our local requirement?
-					if local && len(e.Chain) > 1 {
-						continue
-					}
-					// Does this match the reason? Emit
-					// with a full stack trace that
-					// explains why this violates our
-					// constraints.
-					if e.Reason == r {
-						pass.Reportf(e.Chain[0].LocalPos, "%s", e.String())
-					}
-				}
+
+			// Filter reasons and report.
+			//
+			// For the findings, we use all escapes.
+			for _, es := range all {
+				es.Filter(reasons, local)
+				es.Reportf(pass)
 			}
 
 			// Scan for test (required) matches.
+			//
+			// For tests we need only the merged escapes.
 			testReasonsFound := make(map[EscapeReason]bool)
-			for _, e := range es {
+			for r := EscapeReason(0); r < reasonCount; r++ {
+				if merged.CallSites[r] == nil {
+					continue
+				}
 				// Is this local?
-				local, ok := testReasons[e.Reason]
-				wantLocal := len(e.Chain) == 1
-				testReasonsFound[e.Reason] = wantLocal
+				wantLocal, ok := testReasons[r]
+				isLocal := len(merged.CallSites[r]) == 1
+				testReasonsFound[r] = isLocal
 				if !ok {
 					continue
 				}
-				if local == wantLocal {
-					delete(testReasons, e.Reason)
+				if isLocal == wantLocal {
+					delete(testReasons, r)
 				}
 			}
 			for reason, local := range testReasons {
@@ -714,10 +908,8 @@ func run(pass *analysis.Pass) (interface{}, error) {
 				pass.Reportf(fdecl.Pos(), fmt.Sprintf("testescapes not found: reason=%s, local=%t", reason, local))
 			}
 			if len(testReasons) > 0 {
-				// Dump all reasons found to help in debugging.
-				for _, e := range es {
-					pass.Reportf(e.Chain[0].LocalPos, "escape found: %s", e.String())
-				}
+				// Report for debugging.
+				merged.Reportf(pass)
 			}
 		}
 	}
diff --git a/tools/checkescape/test1/test1.go b/tools/checkescape/test1/test1.go
index 68d3f72cc..27991649f 100644
--- a/tools/checkescape/test1/test1.go
+++ b/tools/checkescape/test1/test1.go
@@ -17,7 +17,6 @@ package test1
 
 import (
 	"fmt"
-	"reflect"
 )
 
 // Interface is a generic interface.
@@ -163,20 +162,6 @@ func dynamicRec(f func()) {
 	Dynamic(f)
 }
 
-// +mustescape:local,unknown
-//go:noinline
-//go:nosplit
-func Unknown() {
-	_ = reflect.TypeOf((*Type)(nil)) // Does not actually escape.
-}
-
-// +mustescape:unknown
-//go:noinline
-//go:nosplit
-func unknownRec() {
-	Unknown()
-}
-
 //go:noinline
 //go:nosplit
 func internalFunc() {
@@ -190,6 +175,7 @@ func Split() {
 
 // +mustescape:stack
 //go:noinline
+//go:nosplit
 func splitRec() {
 	Split()
 }
diff --git a/tools/checkescape/test2/test2.go b/tools/checkescape/test2/test2.go
index 7fce3e3be..067d5a1f4 100644
--- a/tools/checkescape/test2/test2.go
+++ b/tools/checkescape/test2/test2.go
@@ -81,14 +81,9 @@ func dynamicCrossPkg(f func()) {
 	test1.Dynamic(f)
 }
 
-// +mustescape:unknown
-//go:noinline
-func unknownCrossPkg() {
-	test1.Unknown()
-}
-
 // +mustescape:stack
 //go:noinline
+//go:nosplit
 func splitCrosssPkt() {
 	test1.Split()
 }
diff --git a/tools/defs.bzl b/tools/defs.bzl
index e71a26cf4..d75e40863 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,55 +7,90 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/bazeldefs:defs.bzl", _build_test = "build_test", _bzl_library = "bzl_library", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _default_installer = "default_installer", _default_net_util = "default_net_util", _gazelle = "gazelle", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_library = "go_library", _go_path = "go_path", _go_proto_library = "go_proto_library", _go_test = "go_test", _grpcpp = "grpcpp", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _rbe_platform = "rbe_platform", _rbe_toolchain = "rbe_toolchain", _select_arch = "select_arch", _select_system = "select_system", _short_path = "short_path", _vdso_linker_option = "vdso_linker_option")
+load("//tools/nogo:defs.bzl", "nogo_test")
+load("//tools/bazeldefs:defs.bzl", _build_test = "build_test", _bzl_library = "bzl_library", _coreutil = "coreutil", _default_installer = "default_installer", _default_net_util = "default_net_util", _loopback = "loopback", _proto_library = "proto_library", _rbe_platform = "rbe_platform", _rbe_toolchain = "rbe_toolchain", _select_arch = "select_arch", _select_system = "select_system", _short_path = "short_path")
+load("//tools/bazeldefs:cc.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _gbenchmark = "gbenchmark", _grpcpp = "grpcpp", _gtest = "gtest", _vdso_linker_option = "vdso_linker_option")
+load("//tools/bazeldefs:go.bzl", _gazelle = "gazelle", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_library = "go_library", _go_path = "go_path", _go_proto_library = "go_proto_library", _go_test = "go_test", _select_goarch = "select_goarch", _select_goos = "select_goos")
+load("//tools/bazeldefs:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar")
 load("//tools/bazeldefs:platforms.bzl", _default_platform = "default_platform", _platforms = "platforms")
 load("//tools/bazeldefs:tags.bzl", "go_suffixes")
-load("//tools/nogo:defs.bzl", "nogo_test")
 
-# Delegate directly.
+# Core rules.
 build_test = _build_test
 bzl_library = _bzl_library
+default_installer = _default_installer
+default_net_util = _default_net_util
+loopback = _loopback
+select_arch = _select_arch
+select_system = _select_system
+short_path = _short_path
+rbe_platform = _rbe_platform
+rbe_toolchain = _rbe_toolchain
+coreutil = _coreutil
+
+# C++ rules.
 cc_binary = _cc_binary
 cc_flags_supplier = _cc_flags_supplier
 cc_grpc_library = _cc_grpc_library
 cc_library = _cc_library
 cc_test = _cc_test
 cc_toolchain = _cc_toolchain
-default_installer = _default_installer
-default_net_util = _default_net_util
 gbenchmark = _gbenchmark
+gtest = _gtest
+grpcpp = _grpcpp
+vdso_linker_option = _vdso_linker_option
+
+# Go rules.
 gazelle = _gazelle
 go_embed_data = _go_embed_data
 go_path = _go_path
-go_test = _go_test
-gtest = _gtest
-grpcpp = _grpcpp
-loopback = _loopback
+select_goos = _select_goos
+select_goarch = _select_goarch
+
+# Packaging rules.
 pkg_deb = _pkg_deb
 pkg_tar = _pkg_tar
-py_binary = _py_binary
-select_arch = _select_arch
-select_system = _select_system
-short_path = _short_path
-rbe_platform = _rbe_platform
-rbe_toolchain = _rbe_toolchain
-vdso_linker_option = _vdso_linker_option
 
 # Platform options.
 default_platform = _default_platform
 platforms = _platforms
 
-def go_binary(name, **kwargs):
+def go_binary(name, nogo = True, pure = False, static = False, x_defs = None, **kwargs):
     """Wraps the standard go_binary.
 
     Args:
       name: the rule name.
+      nogo: enable nogo analysis.
+      pure: build a pure Go (no CGo) binary.
+      static: build a static binary.
+      x_defs: additional linker definitions.
       **kwargs: standard go_binary arguments.
     """
     _go_binary(
         name = name,
+        pure = pure,
+        static = static,
+        x_defs = x_defs,
         **kwargs
     )
+    if nogo:
+        # Note that the nogo rule applies only for go_library and go_test
+        # targets, therefore we construct a library from the binary sources.
+        # This is done because the binary may not be in a form that objdump
+        # supports (i.e. a pure Go binary).
+        _go_library(
+            name = name + "_nogo_library",
+            srcs = kwargs.get("srcs", []),
+            deps = kwargs.get("deps", []),
+            testonly = 1,
+        )
+        nogo_test(
+            name = name + "_nogo",
+            config = "//:nogo_config",
+            srcs = kwargs.get("srcs", []),
+            deps = [":" + name + "_nogo_library"],
+            tags = ["nogo"],
+        )
 
 def calculate_sets(srcs):
     """Calculates special Go sets for templates.
@@ -119,6 +154,7 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
       stateify: whether statify is enabled (default: true).
       marshal: whether marshal is enabled (default: false).
       marshal_debug: whether the gomarshal tools emits debugging output (default: false).
+      nogo: enable nogo analysis.
       **kwargs: standard go_library arguments.
     """
     all_srcs = srcs
@@ -184,7 +220,10 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
     if nogo:
         nogo_test(
             name = name + "_nogo",
+            config = "//:nogo_config",
+            srcs = all_srcs,
             deps = [":" + name],
+            tags = ["nogo"],
         )
 
     if marshal:
@@ -196,12 +235,36 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
         for (suffix, _) in marshal_sets.items():
             _go_test(
                 name = name + suffix + "_abi_autogen_test",
-                srcs = [name + suffix + "_abi_autogen_test.go"],
+                srcs = [
+                    name + suffix + "_abi_autogen_test.go",
+                    name + suffix + "_abi_autogen_unconditional_test.go",
+                ],
                 library = ":" + name,
                 deps = marshal_test_deps,
                 **kwargs
             )
 
+def go_test(name, nogo = True, **kwargs):
+    """Wraps the standard go_test.
+
+    Args:
+      name: the rule name.
+      nogo: enable nogo analysis.
+      **kwargs: standard go_test arguments.
+    """
+    _go_test(
+        name = name,
+        **kwargs
+    )
+    if nogo:
+        nogo_test(
+            name = name + "_nogo",
+            config = "//:nogo_config",
+            srcs = kwargs.get("srcs", []),
+            deps = [":" + name],
+            tags = ["nogo"],
+        )
+
 def proto_library(name, srcs, deps = None, has_services = 0, **kwargs):
     """Wraps the standard proto_library.
 
diff --git a/tools/github/BUILD b/tools/github/BUILD
new file mode 100644
index 000000000..aad088d13
--- /dev/null
+++ b/tools/github/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "github",
+    srcs = ["main.go"],
+    nogo = False,
+    deps = [
+        "//tools/github/nogo",
+        "//tools/github/reviver",
+        "@com_github_google_go_github_v28//github:go_default_library",
+        "@org_golang_x_oauth2//:go_default_library",
+    ],
+)
diff --git a/tools/github/main.go b/tools/github/main.go
new file mode 100644
index 000000000..681003eef
--- /dev/null
+++ b/tools/github/main.go
@@ -0,0 +1,186 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary github is the entry point for GitHub utilities.
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"os"
+	"os/exec"
+	"strings"
+
+	"github.com/google/go-github/github"
+	"golang.org/x/oauth2"
+	"gvisor.dev/gvisor/tools/github/nogo"
+	"gvisor.dev/gvisor/tools/github/reviver"
+)
+
+var (
+	owner     string
+	repo      string
+	tokenFile string
+	paths     stringList
+	commit    string
+	dryRun    bool
+)
+
+type stringList []string
+
+func (s *stringList) String() string {
+	return strings.Join(*s, ",")
+}
+
+func (s *stringList) Set(value string) error {
+	*s = append(*s, value)
+	return nil
+}
+
+// Keep the options simple for now. Supports only a single path and repo.
+func init() {
+	flag.StringVar(&owner, "owner", "", "GitHub project org/owner (required, except nogo dry-run)")
+	flag.StringVar(&repo, "repo", "", "GitHub repo (required, except nogo dry-run)")
+	flag.StringVar(&tokenFile, "oauth-token-file", "", "file containing the GitHub token (or GITHUB_TOKEN is set)")
+	flag.Var(&paths, "path", "path(s) to scan (required for revive and nogo)")
+	flag.StringVar(&commit, "commit", "", "commit to associated (required for nogo, except dry-run)")
+	flag.BoolVar(&dryRun, "dry-run", false, "just print changes to be made")
+}
+
+func filterPaths(paths []string) (existing []string) {
+	for _, path := range paths {
+		if _, err := os.Stat(path); err != nil {
+			log.Printf("WARNING: skipping %v: %v", path, err)
+			continue
+		}
+		existing = append(existing, path)
+	}
+	return
+}
+
+func main() {
+	// Set defaults from the environment.
+	repository := os.Getenv("GITHUB_REPOSITORY")
+	if parts := strings.SplitN(repository, "/", 2); len(parts) == 2 {
+		owner = parts[0]
+		repo = parts[1]
+	}
+
+	// Parse flags.
+	flag.Usage = func() {
+		fmt.Fprintf(flag.CommandLine.Output(), "usage: %s [options] <command>\n", os.Args[0])
+		fmt.Fprintf(flag.CommandLine.Output(), "commands: revive, nogo\n")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+	args := flag.Args()
+	if len(args) != 1 {
+		fmt.Fprintf(flag.CommandLine.Output(), "extra arguments: %s\n", strings.Join(args[1:], ", "))
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	// Check for mandatory parameters.
+	command := args[0]
+	if len(owner) == 0 && (command != "nogo" || !dryRun) {
+		fmt.Fprintln(flag.CommandLine.Output(), "missing --owner option.")
+		flag.Usage()
+		os.Exit(1)
+	}
+	if len(repo) == 0 && (command != "nogo" || !dryRun) {
+		fmt.Fprintln(flag.CommandLine.Output(), "missing --repo option.")
+		flag.Usage()
+		os.Exit(1)
+	}
+	filteredPaths := filterPaths(paths)
+	if len(filteredPaths) == 0 {
+		fmt.Fprintln(flag.CommandLine.Output(), "no valid --path options provided.")
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	// The access token may be passed as a file so it doesn't show up in
+	// command line arguments. It also may be provided through the
+	// environment to faciliate use through GitHub's CI system.
+	token := os.Getenv("GITHUB_TOKEN")
+	if len(tokenFile) != 0 {
+		bytes, err := ioutil.ReadFile(tokenFile)
+		if err != nil {
+			fmt.Println(err.Error())
+			os.Exit(1)
+		}
+		token = string(bytes)
+	}
+	var client *github.Client
+	if len(token) == 0 {
+		// Client is unauthenticated.
+		client = github.NewClient(nil)
+	} else {
+		// Using the above token.
+		ts := oauth2.StaticTokenSource(
+			&oauth2.Token{AccessToken: token},
+		)
+		tc := oauth2.NewClient(context.Background(), ts)
+		client = github.NewClient(tc)
+	}
+
+	switch command {
+	case "revive":
+		// Load existing GitHub bugs.
+		bugger, err := reviver.NewGitHubBugger(client, owner, repo, dryRun)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "Error getting github issues: %v\n", err)
+			os.Exit(1)
+		}
+		// Scan the provided path.
+		rev := reviver.New(filteredPaths, []reviver.Bugger{bugger})
+		if errs := rev.Run(); len(errs) > 0 {
+			fmt.Fprintf(os.Stderr, "Encountered %d errors:\n", len(errs))
+			for _, err := range errs {
+				fmt.Fprintf(os.Stderr, "\t%v\n", err)
+			}
+			os.Exit(1)
+		}
+	case "nogo":
+		// Did we get a commit? Try to extract one.
+		if len(commit) == 0 && !dryRun {
+			cmd := exec.Command("git", "rev-parse", "HEAD")
+			revBytes, err := cmd.Output()
+			if err != nil {
+				fmt.Fprintf(flag.CommandLine.Output(), "missing --commit option, unable to infer: %v\n", err)
+				flag.Usage()
+				os.Exit(1)
+			}
+			commit = strings.TrimSpace(string(revBytes))
+		}
+		// Scan all findings.
+		poster := nogo.NewFindingsPoster(client, owner, repo, commit, dryRun)
+		if err := poster.Walk(filteredPaths); err != nil {
+			fmt.Fprintln(os.Stderr, "Error finding nogo findings:", err)
+			os.Exit(1)
+		}
+		// Post to GitHub.
+		if err := poster.Post(); err != nil {
+			fmt.Fprintln(os.Stderr, "Error posting nogo findings:", err)
+		}
+	default:
+		// Not a known command.
+		fmt.Fprintf(flag.CommandLine.Output(), "unknown command: %s\n", command)
+		flag.Usage()
+		os.Exit(1)
+	}
+}
diff --git a/tools/github/nogo/BUILD b/tools/github/nogo/BUILD
new file mode 100644
index 000000000..19b7eec4d
--- /dev/null
+++ b/tools/github/nogo/BUILD
@@ -0,0 +1,16 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "nogo",
+    srcs = ["nogo.go"],
+    nogo = False,
+    visibility = [
+        "//tools/github:__subpackages__",
+    ],
+    deps = [
+        "//tools/nogo",
+        "@com_github_google_go_github_v28//github:go_default_library",
+    ],
+)
diff --git a/tools/github/nogo/nogo.go b/tools/github/nogo/nogo.go
new file mode 100644
index 000000000..27ab1b8eb
--- /dev/null
+++ b/tools/github/nogo/nogo.go
@@ -0,0 +1,132 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package nogo provides nogo-related utilities.
+package nogo
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/google/go-github/github"
+	"gvisor.dev/gvisor/tools/nogo"
+)
+
+// FindingsPoster is a simple wrapper around the GitHub api.
+type FindingsPoster struct {
+	owner     string
+	repo      string
+	commit    string
+	dryRun    bool
+	startTime time.Time
+
+	findings map[nogo.Finding]struct{}
+	client   *github.Client
+}
+
+// NewFindingsPoster returns a object that can post findings.
+func NewFindingsPoster(client *github.Client, owner, repo, commit string, dryRun bool) *FindingsPoster {
+	return &FindingsPoster{
+		owner:     owner,
+		repo:      repo,
+		commit:    commit,
+		dryRun:    dryRun,
+		startTime: time.Now(),
+		findings:  make(map[nogo.Finding]struct{}),
+		client:    client,
+	}
+}
+
+// Walk walks the given path tree for findings files.
+func (p *FindingsPoster) Walk(paths []string) error {
+	for _, path := range paths {
+		if err := filepath.Walk(path, func(filename string, info os.FileInfo, err error) error {
+			if err != nil {
+				return err
+			}
+			// Skip any directories or files not ending in .findings.
+			if !strings.HasSuffix(filename, ".findings") || info.IsDir() {
+				return nil
+			}
+			findings, err := nogo.ExtractFindingsFromFile(filename)
+			if err != nil {
+				return err
+			}
+			// Add all findings to the list. We use a map to ensure
+			// that each finding is unique.
+			for _, finding := range findings {
+				p.findings[finding] = struct{}{}
+			}
+			return nil
+		}); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Post posts all results to the GitHub API as a check run.
+func (p *FindingsPoster) Post() error {
+	// Just show results?
+	if p.dryRun {
+		for finding, _ := range p.findings {
+			// Pretty print, so that this is useful for debugging.
+			fmt.Printf("%s: (%s+%d) %s\n", finding.Category, finding.Position.Filename, finding.Position.Line, finding.Message)
+		}
+		return nil
+	}
+
+	// Construct the message.
+	title := "nogo"
+	count := len(p.findings)
+	status := "completed"
+	conclusion := "success"
+	if count > 0 {
+		conclusion = "failure" // Contains errors.
+	}
+	summary := fmt.Sprintf("%d findings.", count)
+	opts := github.CreateCheckRunOptions{
+		Name:        title,
+		HeadSHA:     p.commit,
+		Status:      &status,
+		Conclusion:  &conclusion,
+		StartedAt:   &github.Timestamp{p.startTime},
+		CompletedAt: &github.Timestamp{time.Now()},
+		Output: &github.CheckRunOutput{
+			Title:            &title,
+			Summary:          &summary,
+			AnnotationsCount: &count,
+		},
+	}
+	annotationLevel := "failure" // Always.
+	for finding, _ := range p.findings {
+		title := string(finding.Category)
+		opts.Output.Annotations = append(opts.Output.Annotations, &github.CheckRunAnnotation{
+			Path:            &finding.Position.Filename,
+			StartLine:       &finding.Position.Line,
+			EndLine:         &finding.Position.Line,
+			Message:         &finding.Message,
+			Title:           &title,
+			AnnotationLevel: &annotationLevel,
+		})
+	}
+
+	// Post to GitHub.
+	_, _, err := p.client.Checks.CreateCheckRun(context.Background(), p.owner, p.repo, opts)
+	return err
+}
diff --git a/tools/github/reviver/BUILD b/tools/github/reviver/BUILD
new file mode 100644
index 000000000..7d78480a7
--- /dev/null
+++ b/tools/github/reviver/BUILD
@@ -0,0 +1,27 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "reviver",
+    srcs = [
+        "github.go",
+        "reviver.go",
+    ],
+    nogo = False,
+    visibility = [
+        "//tools/github:__subpackages__",
+    ],
+    deps = ["@com_github_google_go_github_v28//github:go_default_library"],
+)
+
+go_test(
+    name = "reviver_test",
+    size = "small",
+    srcs = [
+        "github_test.go",
+        "reviver_test.go",
+    ],
+    library = ":reviver",
+    nogo = False,
+)
diff --git a/tools/issue_reviver/github/github.go b/tools/github/reviver/github.go
index 8ffd7e606..c4b624f2a 100644
--- a/tools/issue_reviver/github/github.go
+++ b/tools/github/reviver/github.go
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package github implements reviver.Bugger interface on top of Github issues.
-package github
+package reviver
 
 import (
 	"context"
@@ -23,12 +22,10 @@ import (
 	"time"
 
 	"github.com/google/go-github/github"
-	"golang.org/x/oauth2"
-	"gvisor.dev/gvisor/tools/issue_reviver/reviver"
 )
 
-// Bugger implements reviver.Bugger interface for github issues.
-type Bugger struct {
+// GitHubBugger implements Bugger interface for github issues.
+type GitHubBugger struct {
 	owner  string
 	repo   string
 	dryRun bool
@@ -37,36 +34,25 @@ type Bugger struct {
 	issues map[int]*github.Issue
 }
 
-// NewBugger creates a new Bugger.
-func NewBugger(token, owner, repo string, dryRun bool) (*Bugger, error) {
-	b := &Bugger{
+// NewGitHubBugger creates a new GitHubBugger.
+func NewGitHubBugger(client *github.Client, owner, repo string, dryRun bool) (*GitHubBugger, error) {
+	b := &GitHubBugger{
 		owner:  owner,
 		repo:   repo,
 		dryRun: dryRun,
 		issues: map[int]*github.Issue{},
+		client: client,
 	}
-	if err := b.load(token); err != nil {
+	if err := b.load(); err != nil {
 		return nil, err
 	}
 	return b, nil
 }
 
-func (b *Bugger) load(token string) error {
-	ctx := context.Background()
-	if len(token) == 0 {
-		fmt.Print("No OAUTH token provided, using unauthenticated account.\n")
-		b.client = github.NewClient(nil)
-	} else {
-		ts := oauth2.StaticTokenSource(
-			&oauth2.Token{AccessToken: token},
-		)
-		tc := oauth2.NewClient(ctx, ts)
-		b.client = github.NewClient(tc)
-	}
-
+func (b *GitHubBugger) load() error {
 	err := processAllPages(func(listOpts github.ListOptions) (*github.Response, error) {
 		opts := &github.IssueListByRepoOptions{State: "open", ListOptions: listOpts}
-		tmps, resp, err := b.client.Issues.ListByRepo(ctx, b.owner, b.repo, opts)
+		tmps, resp, err := b.client.Issues.ListByRepo(context.Background(), b.owner, b.repo, opts)
 		if err != nil {
 			return resp, err
 		}
@@ -83,8 +69,8 @@ func (b *Bugger) load(token string) error {
 	return nil
 }
 
-// Activate implements reviver.Bugger.
-func (b *Bugger) Activate(todo *reviver.Todo) (bool, error) {
+// Activate implements Bugger.Activate.
+func (b *GitHubBugger) Activate(todo *Todo) (bool, error) {
 	id, err := parseIssueNo(todo.Issue)
 	if err != nil {
 		return true, err
@@ -135,13 +121,24 @@ func (b *Bugger) Activate(todo *reviver.Todo) (bool, error) {
 	return true, nil
 }
 
+var issuePrefixes = []string{
+	"gvisor.dev/issue/",
+	"gvisor.dev/issues/",
+}
+
 // parseIssueNo parses the issue number out of the issue url.
+//
+// 0 is returned if url does not correspond to an issue.
 func parseIssueNo(url string) (int, error) {
-	const prefix = "gvisor.dev/issue/"
-
 	// First check if I can handle the TODO.
-	idStr := strings.TrimPrefix(url, prefix)
-	if len(url) == len(idStr) {
+	var idStr string
+	for _, p := range issuePrefixes {
+		if str := strings.TrimPrefix(url, p); len(str) < len(url) {
+			idStr = str
+			break
+		}
+	}
+	if len(idStr) == 0 {
 		return 0, nil
 	}
 
diff --git a/tools/issue_reviver/github/github_test.go b/tools/github/reviver/github_test.go
index a78b230ef..5df7e3624 100644
--- a/tools/issue_reviver/github/github_test.go
+++ b/tools/github/reviver/github_test.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package github
+package reviver
 
 import (
 	"testing"
diff --git a/tools/issue_reviver/reviver/reviver.go b/tools/github/reviver/reviver.go
index 2af7f0d59..2af7f0d59 100644
--- a/tools/issue_reviver/reviver/reviver.go
+++ b/tools/github/reviver/reviver.go
diff --git a/tools/issue_reviver/reviver/reviver_test.go b/tools/github/reviver/reviver_test.go
index a9fb1f9f1..851306c9d 100644
--- a/tools/issue_reviver/reviver/reviver_test.go
+++ b/tools/github/reviver/reviver_test.go
@@ -33,6 +33,15 @@ func TestProcessLine(t *testing.T) {
 			},
 		},
 		{
+			line: "// TODO(foobar.com/issues/123): comment, bla. blabla.",
+			want: &Todo{
+				Issue: "foobar.com/issues/123",
+				Locations: []Location{
+					{Comment: "comment, bla. blabla."},
+				},
+			},
+		},
+		{
 			line: "// FIXME(b/123): internal bug",
 			want: &Todo{
 				Issue: "b/123",
diff --git a/tools/go_generics/defs.bzl b/tools/go_generics/defs.bzl
index 33329cf28..ad97208a8 100644
--- a/tools/go_generics/defs.bzl
+++ b/tools/go_generics/defs.bzl
@@ -1,25 +1,32 @@
-"""Generics support via go_generics."""
+"""Generics support via go_generics.
+
+A Go template is similar to a go library, except that it has certain types that
+can be replaced before usage. For example, one could define a templatized List
+struct, whose elements are of type T, then instantiate that template for
+T=segment, where "segment" is the concrete type.
+"""
 
 TemplateInfo = provider(
+    "Information about a go_generics template.",
     fields = {
+        "unsafe": "whether the template requires unsafe code",
         "types": "required types",
         "opt_types": "optional types",
         "consts": "required consts",
         "opt_consts": "optional consts",
         "deps": "package dependencies",
-        "file": "merged template",
+        "template": "merged template source file",
     },
 )
 
 def _go_template_impl(ctx):
     srcs = ctx.files.srcs
-    output = ctx.outputs.out
-
-    args = ["-o=%s" % output.path] + [f.path for f in srcs]
+    template = ctx.actions.declare_file(ctx.label.name + "_template.go")
+    args = ["-o=%s" % template.path] + [f.path for f in srcs]
 
     ctx.actions.run(
         inputs = srcs,
-        outputs = [output],
+        outputs = [template],
         mnemonic = "GoGenericsTemplate",
         progress_message = "Building Go template %s" % ctx.label,
         arguments = args,
@@ -32,74 +39,48 @@ def _go_template_impl(ctx):
         consts = ctx.attr.consts,
         opt_consts = ctx.attr.opt_consts,
         deps = ctx.attr.deps,
-        file = output,
+        template = template,
     )]
 
-"""
-Generates a Go template from a set of Go files.
-
-A Go template is similar to a go library, except that it has certain types that
-can be replaced before usage. For example, one could define a templatized List
-struct, whose elements are of type T, then instantiate that template for
-T=segment, where "segment" is the concrete type.
-
-Args:
-  name: the name of the template.
-  srcs: the list of source files that comprise the template.
-  types: the list of generic types in the template that are required to be specified.
-  opt_types: the list of generic types in the template that can but aren't required to be specified.
-  consts: the list of constants in the template that are required to be specified.
-  opt_consts: the list of constants in the template that can but aren't required to be specified.
-  deps: the list of dependencies.
-"""
 go_template = rule(
     implementation = _go_template_impl,
     attrs = {
-        "srcs": attr.label_list(mandatory = True, allow_files = True),
-        "deps": attr.label_list(allow_files = True, cfg = "target"),
-        "types": attr.string_list(),
-        "opt_types": attr.string_list(),
-        "consts": attr.string_list(),
-        "opt_consts": attr.string_list(),
+        "srcs": attr.label_list(doc = "the list of source files that comprise the template", mandatory = True, allow_files = True),
+        "deps": attr.label_list(doc = "the standard dependency list", allow_files = True, cfg = "target"),
+        "types": attr.string_list(doc = "the list of generic types in the template that are required to be specified"),
+        "opt_types": attr.string_list(doc = "the list of generic types in the template that can but aren't required to be specified"),
+        "consts": attr.string_list(doc = "the list of constants in the template that are required to be specified"),
+        "opt_consts": attr.string_list(doc = "the list of constants in the template that can but aren't required to be specified"),
         "_tool": attr.label(executable = True, cfg = "host", default = Label("//tools/go_generics/go_merge")),
     },
-    outputs = {
-        "out": "%{name}_template.go",
-    },
-)
-
-TemplateInstanceInfo = provider(
-    fields = {
-        "srcs": "source files",
-    },
 )
 
 def _go_template_instance_impl(ctx):
-    template = ctx.attr.template[TemplateInfo]
+    info = ctx.attr.template[TemplateInfo]
     output = ctx.outputs.out
 
     # Check that all required types are defined.
-    for t in template.types:
+    for t in info.types:
         if t not in ctx.attr.types:
             fail("Missing value for type %s in %s" % (t, ctx.attr.template.label))
 
     # Check that all defined types are expected by the template.
     for t in ctx.attr.types:
-        if (t not in template.types) and (t not in template.opt_types):
+        if (t not in info.types) and (t not in info.opt_types):
             fail("Type %s it not a parameter to %s" % (t, ctx.attr.template.label))
 
     # Check that all required consts are defined.
-    for t in template.consts:
+    for t in info.consts:
         if t not in ctx.attr.consts:
             fail("Missing value for constant %s in %s" % (t, ctx.attr.template.label))
 
     # Check that all defined consts are expected by the template.
     for t in ctx.attr.consts:
-        if (t not in template.consts) and (t not in template.opt_consts):
+        if (t not in info.consts) and (t not in info.opt_consts):
             fail("Const %s it not a parameter to %s" % (t, ctx.attr.template.label))
 
     # Build the argument list.
-    args = ["-i=%s" % template.file.path, "-o=%s" % output.path]
+    args = ["-i=%s" % info.template.path, "-o=%s" % output.path]
     if ctx.attr.package:
         args.append("-p=%s" % ctx.attr.package)
 
@@ -117,7 +98,7 @@ def _go_template_instance_impl(ctx):
         args.append("-anon")
 
     ctx.actions.run(
-        inputs = [template.file],
+        inputs = [info.template],
         outputs = [output],
         mnemonic = "GoGenericsInstance",
         progress_message = "Building Go template instance %s" % ctx.label,
@@ -125,35 +106,22 @@ def _go_template_instance_impl(ctx):
         executable = ctx.executable._tool,
     )
 
-    return [TemplateInstanceInfo(
-        srcs = [output],
+    return [DefaultInfo(
+        files = depset([output]),
     )]
 
-"""
-Instantiates a Go template by replacing all generic types with concrete ones.
-
-Args:
-  name: the name of the template instance.
-  template: the label of the template to be instatiated.
-  prefix: a prefix to be added to globals in the template.
-  suffix: a suffix to be added to global in the template.
-  types: the map from generic type names to concrete ones.
-  consts: the map from constant names to their values.
-  imports: the map from imports used in types/consts to their import paths.
-  package: the name of the package the instantiated template will be compiled into.
-"""
 go_template_instance = rule(
     implementation = _go_template_instance_impl,
     attrs = {
-        "template": attr.label(mandatory = True),
-        "prefix": attr.string(),
-        "suffix": attr.string(),
-        "types": attr.string_dict(),
-        "consts": attr.string_dict(),
-        "imports": attr.string_dict(),
-        "anon": attr.bool(mandatory = False, default = False),
-        "package": attr.string(mandatory = False),
-        "out": attr.output(mandatory = True),
+        "template": attr.label(doc = "the label of the template to be instantiated", mandatory = True),
+        "prefix": attr.string(doc = "a prefix to be added to globals in the template"),
+        "suffix": attr.string(doc = "a suffix to be added to globals in the template"),
+        "types": attr.string_dict(doc = "the map from generic type names to concrete ones"),
+        "consts": attr.string_dict(doc = "the map from constant names to their values"),
+        "imports": attr.string_dict(doc = "the map from imports used in types/consts to their import paths"),
+        "anon": attr.bool(doc = "whether anoymous fields should be processed", mandatory = False, default = False),
+        "package": attr.string(doc = "the package for the generated source file", mandatory = False),
+        "out": attr.output(doc = "output file", mandatory = True),
         "_tool": attr.label(executable = True, cfg = "host", default = Label("//tools/go_generics")),
     },
 )
diff --git a/tools/go_generics/go_merge/main.go b/tools/go_generics/go_merge/main.go
index f6a331123..e0345500f 100644
--- a/tools/go_generics/go_merge/main.go
+++ b/tools/go_generics/go_merge/main.go
@@ -77,6 +77,7 @@ func main() {
 	// Create a new declaration slice with all imports at the top, merging any
 	// redundant imports.
 	imports := make(map[string]*ast.ImportSpec)
+	var importNames []string // Keep imports in the original order to get deterministic output.
 	var anonImports []*ast.ImportSpec
 	for _, d := range f.Decls {
 		if g, ok := d.(*ast.GenDecl); ok && g.Tok == token.IMPORT {
@@ -98,6 +99,7 @@ func main() {
 						}
 					} else {
 						imports[n] = i
+						importNames = append(importNames, n)
 					}
 				}
 			}
@@ -112,8 +114,8 @@ func main() {
 			Lparen: token.NoPos + 1,
 			Specs:  make([]ast.Spec, 0, l),
 		}
-		for _, i := range imports {
-			d.Specs = append(d.Specs, i)
+		for _, i := range importNames {
+			d.Specs = append(d.Specs, imports[i])
 		}
 		for _, i := range anonImports {
 			d.Specs = append(d.Specs, i)
diff --git a/tools/go_generics/imports.go b/tools/go_generics/imports.go
index 148dc7216..90d3aa1e0 100644
--- a/tools/go_generics/imports.go
+++ b/tools/go_generics/imports.go
@@ -21,6 +21,7 @@ import (
 	"go/format"
 	"go/parser"
 	"go/token"
+	"sort"
 	"strconv"
 
 	"gvisor.dev/gvisor/tools/go_generics/globals"
@@ -132,10 +133,17 @@ func updateImports(maps []mapValue, imports mapValue) (ast.Decl, error) {
 	if len(importsUsed) == 0 {
 		return nil, nil
 	}
+	var names []string
+	for n := range importsUsed {
+		names = append(names, n)
+	}
+	// Sort the new imports for deterministic build outputs.
+	sort.Strings(names)
 
 	// Create spec array for each new import.
 	specs := make([]ast.Spec, 0, len(importsUsed))
-	for _, i := range importsUsed {
+	for _, n := range names {
+		i := importsUsed[n]
 		specs = append(specs, &ast.ImportSpec{
 			Name: &ast.Ident{Name: i.newName},
 			Path: &ast.BasicLit{Value: i.path},
diff --git a/tools/go_marshal/README.md b/tools/go_marshal/README.md
index 68d759083..d8045c295 100644
--- a/tools/go_marshal/README.md
+++ b/tools/go_marshal/README.md
@@ -3,18 +3,19 @@ This package implements the go_marshal utility.
 # Overview
 
 `go_marshal` is a code generation utility similar to `go_stateify` for
-automatically generating code to marshal go data structures to memory.
+marshalling go data structures to and from memory.
 
 `go_marshal` attempts to improve on `binary.Write` and the sentry's
-`binary.Marshal` by moving the go runtime reflection necessary to marshal a
-struct to compile-time.
+`binary.Marshal` by moving the expensive use of reflection from runtime to
+compile-time.
 
 `go_marshal` automatically generates implementations for `marshal.Marshallable`
-and `safemem.{Reader,Writer}`. Data structures that require custom serialization
-will have manual implementations for these interfaces.
+interface. Data structures that require custom serialization can be accomodated
+through a manual implementation this interface.
 
 Data structures can be flagged for code generation by adding a struct-level
-comment `// +marshal`.
+comment `// +marshal`. For additional details and options, see the documentation
+for the `marshal.Marshallable` interface.
 
 # Usage
 
@@ -74,7 +75,7 @@ intended for ABI structs, which have these additional restrictions:
     dependent native pointer size.
 
 -   Fields must either be a primitive integer type (`byte`,
-    `[u]int{8,16,32,64}`), or of a type that implements abi.Marshallable.
+    `[u]int{8,16,32,64}`), or of a type that implements `marshal.Marshallable`.
 
 -   `int` and `uint` fields are not allowed. Use an explicitly-sized numeric
     type.
@@ -112,3 +113,18 @@ The following are some guidelines for modifying the `go_marshal` tool:
 -   No runtime reflection in the code generated for the marshallable interface.
     The entire point of the tool is to avoid runtime reflection. The generated
     tests may use reflection.
+
+## Debugging
+
+To enable debugging output from the go-marshal tool, use one of the following
+options, depending on how go-marshal is being invoked:
+
+-   Pass `--define gomarshal=verbose` to the bazel command. Note that this can
+    generate a lot of output depending on what's being compiled, as this will
+    enable debugging for all packages built by the command.
+
+-   Set `marshal_debug = True` on the top-level `go_library` BUILD rule.
+
+-   Set `debug = True` on the `go_marshal` BUILD rule.
+
+-   Pass `-debug` to the go-marshal tool invocation.
diff --git a/tools/go_marshal/defs.bzl b/tools/go_marshal/defs.bzl
index 323e33882..f44f83eab 100644
--- a/tools/go_marshal/defs.bzl
+++ b/tools/go_marshal/defs.bzl
@@ -4,11 +4,13 @@ def _go_marshal_impl(ctx):
     """Execute the go_marshal tool."""
     output = ctx.outputs.lib
     output_test = ctx.outputs.test
+    output_test_unconditional = ctx.outputs.test_unconditional
 
     # Run the marshal command.
     args = ["-output=%s" % output.path]
-    args += ["-pkg=%s" % ctx.attr.package]
-    args += ["-output_test=%s" % output_test.path]
+    args.append("-pkg=%s" % ctx.attr.package)
+    args.append("-output_test=%s" % output_test.path)
+    args.append("-output_test_unconditional=%s" % output_test_unconditional.path)
 
     if ctx.attr.debug:
         args += ["-debug"]
@@ -18,7 +20,7 @@ def _go_marshal_impl(ctx):
         args += [f.path for f in src.files.to_list()]
     ctx.actions.run(
         inputs = ctx.files.srcs,
-        outputs = [output, output_test],
+        outputs = [output, output_test, output_test_unconditional],
         mnemonic = "GoMarshal",
         progress_message = "go_marshal: %s" % ctx.label,
         arguments = args,
@@ -48,6 +50,7 @@ go_marshal = rule(
     outputs = {
         "lib": "%{name}_unsafe.go",
         "test": "%{name}_test.go",
+        "test_unconditional": "%{name}_unconditional_test.go",
     },
 )
 
@@ -56,7 +59,7 @@ marshal_deps = [
     "//pkg/gohacks",
     "//pkg/safecopy",
     "//pkg/usermem",
-    "//tools/go_marshal/marshal",
+    "//pkg/marshal",
 ]
 
 # marshal_test_deps are required by test targets.
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 19bcd4e6a..4a53d25be 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -38,8 +38,8 @@ import (
 // All recievers are single letters, so we don't allow import aliases to be a
 // single letter.
 var badIdents = []string{
-	"addr", "blk", "buf", "dst", "dsts", "count", "err", "hdr", "idx", "inner",
-	"length", "limit", "ptr", "size", "src", "srcs", "task", "val",
+	"addr", "blk", "buf", "cc", "dst", "dsts", "count", "err", "hdr", "idx",
+	"inner", "length", "limit", "ptr", "size", "src", "srcs", "val",
 	// All single-letter identifiers.
 }
 
@@ -68,6 +68,8 @@ type Generator struct {
 	output *os.File
 	// Output file to write generated tests.
 	outputTest *os.File
+	// Output file to write unconditionally generated tests.
+	outputTestUC *os.File
 	// Package name for the generated file.
 	pkg string
 	// Set of extra packages to import in the generated file.
@@ -75,21 +77,26 @@ type Generator struct {
 }
 
 // NewGenerator creates a new code Generator.
-func NewGenerator(srcs []string, out, outTest, pkg string, imports []string) (*Generator, error) {
+func NewGenerator(srcs []string, out, outTest, outTestUnconditional, pkg string, imports []string) (*Generator, error) {
 	f, err := os.OpenFile(out, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
 	if err != nil {
-		return nil, fmt.Errorf("Couldn't open output file %q: %v", out, err)
+		return nil, fmt.Errorf("couldn't open output file %q: %w", out, err)
 	}
 	fTest, err := os.OpenFile(outTest, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
 	if err != nil {
-		return nil, fmt.Errorf("Couldn't open test output file %q: %v", out, err)
+		return nil, fmt.Errorf("couldn't open test output file %q: %w", out, err)
+	}
+	fTestUC, err := os.OpenFile(outTestUnconditional, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
+	if err != nil {
+		return nil, fmt.Errorf("couldn't open unconditional test output file %q: %w", out, err)
 	}
 	g := Generator{
-		inputs:     srcs,
-		output:     f,
-		outputTest: fTest,
-		pkg:        pkg,
-		imports:    newImportTable(),
+		inputs:       srcs,
+		output:       f,
+		outputTest:   fTest,
+		outputTestUC: fTestUC,
+		pkg:          pkg,
+		imports:      newImportTable(),
 	}
 	for _, i := range imports {
 		// All imports on the extra imports list are unconditionally marked as
@@ -107,7 +114,7 @@ func NewGenerator(srcs []string, out, outTest, pkg string, imports []string) (*G
 	g.imports.add("gvisor.dev/gvisor/pkg/gohacks")
 	g.imports.add("gvisor.dev/gvisor/pkg/safecopy")
 	g.imports.add("gvisor.dev/gvisor/pkg/usermem")
-	g.imports.add("gvisor.dev/gvisor/tools/go_marshal/marshal")
+	g.imports.add("gvisor.dev/gvisor/pkg/marshal")
 
 	return &g, nil
 }
@@ -174,7 +181,7 @@ func (g *Generator) parse() ([]*ast.File, []*token.FileSet, error) {
 		f, err := parser.ParseFile(fset, path, nil, parser.ParseComments)
 		if err != nil {
 			// Not a valid input file?
-			return nil, nil, fmt.Errorf("Input %q can't be parsed: %v", path, err)
+			return nil, nil, fmt.Errorf("input %q can't be parsed: %w", path, err)
 		}
 
 		if debugEnabled() {
@@ -454,6 +461,46 @@ func (g *Generator) Run() error {
 // source file.
 func (g *Generator) writeTests(ts []*testGenerator) error {
 	var b sourceBuffer
+
+	// Write the unconditional test file. This file is always compiled,
+	// regardless of what build tags were specified on the original input
+	// files. We use this file to guarantee we never end up with an empty test
+	// file, as that causes the build to fail with "no tests/benchmarks/examples
+	// found".
+	//
+	// There's no easy way to determine ahead of time if we'll end up with an
+	// empty build file since build constraints can arbitrarily cause some of
+	// the original types to be not defined. We also have no way to tell bazel
+	// to omit the entire test suite since the output files are already defined
+	// before go-marshal is called.
+	b.emit("// Automatically generated marshal tests. See tools/go_marshal.\n\n")
+	b.emit("package %s\n\n", g.pkg)
+	b.emit("func Example() {\n")
+	b.inIndent(func() {
+		b.emit("// This example is intentionally empty, and ensures this package contains at\n")
+		b.emit("// least one testable entity. go-marshal is forced to emit a test package if the\n")
+		b.emit("// input package is marked marshallable, but emitting no testable entities \n")
+		b.emit("// results in a build failure.\n")
+	})
+	b.emit("}\n")
+	if err := b.write(g.outputTestUC); err != nil {
+		return err
+	}
+
+	// Now generate the real test file that contains the real types we
+	// processed. These need to be conditionally compiled according to the build
+	// tags, as the original types may not be defined under all build
+	// configurations.
+
+	b.reset()
+	b.emit("// Automatically generated marshal tests. See tools/go_marshal.\n\n")
+
+	// Emit build tags.
+	if t := tags.Aggregate(g.inputs); len(t) > 0 {
+		b.emit(strings.Join(t.Lines(), "\n"))
+		b.emit("\n\n")
+	}
+
 	b.emit("package %s\n\n", g.pkg)
 	if err := b.write(g.outputTest); err != nil {
 		return err
@@ -470,26 +517,6 @@ func (g *Generator) writeTests(ts []*testGenerator) error {
 	}
 
 	// Write test functions.
-
-	// If we didn't generate any Marshallable implementations, we can't just
-	// emit an empty test file, since that causes the build to fail with "no
-	// tests/benchmarks/examples found". Unfortunately we can't signal bazel to
-	// omit the entire package since the outputs are already defined before
-	// go-marshal is called. If we'd otherwise emit an empty test suite, emit an
-	// empty example instead.
-	if len(ts) == 0 {
-		b.reset()
-		b.emit("func Example() {\n")
-		b.inIndent(func() {
-			b.emit("// This example is intentionally empty to ensure this file contains at least\n")
-			b.emit("// one testable entity. go-marshal is forced to emit a test file if a package\n")
-			b.emit("// is marked marshallable, but emitting a test file with no entities results\n")
-			b.emit("// in a build failure.\n")
-		})
-		b.emit("}\n")
-		return b.write(g.outputTest)
-	}
-
 	for _, t := range ts {
 		if err := t.write(g.outputTest); err != nil {
 			return err
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
index e3c3dac63..36447b86b 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -43,8 +43,8 @@ type interfaceGenerator struct {
 	// of t's interfaces.
 	ms map[string]struct{}
 
-	// as records embedded fields in t that are potentially not packed. The key
-	// is the accessor for the field.
+	// as records fields in t that are potentially not packed. The key is the
+	// accessor for the field.
 	as map[string]struct{}
 }
 
@@ -224,7 +224,7 @@ func (g *interfaceGenerator) emitNoEscapeSliceDataPointer(srcPtr, dstVar string)
 func (g *interfaceGenerator) emitKeepAlive(ptrVar string) {
 	g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", ptrVar)
 	g.emit("// must live until the use above.\n")
-	g.emit("runtime.KeepAlive(%s)\n", ptrVar)
+	g.emit("runtime.KeepAlive(%s) // escapes: replaced by intrinsic.\n", ptrVar)
 }
 
 func (g *interfaceGenerator) expandBinaryExpr(b *strings.Builder, e *ast.BinaryExpr) {
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
index 72ef03a22..7525b52da 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
@@ -102,11 +102,11 @@ func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n *ast.Ident, a *as
 
 	g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n")
 	g.emit("//go:nosplit\n")
-	g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyOutN(cc marshal.CopyContext, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
 
-		g.emit("length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
+		g.emit("length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
 		g.emitKeepAlive(g.r)
 		g.emit("return length, err\n")
 	})
@@ -114,19 +114,19 @@ func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n *ast.Ident, a *as
 
 	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
 	g.emit("//go:nosplit\n")
-	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyOut(cc marshal.CopyContext, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
-		g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r)
+		g.emit("return %s.CopyOutN(cc, addr, %s.SizeBytes())\n", g.r, g.r)
 	})
 	g.emit("}\n\n")
 
 	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
 	g.emit("//go:nosplit\n")
-	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyIn(cc marshal.CopyContext, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
 
-		g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
+		g.emit("length, err := cc.CopyInBytes(addr, buf) // escapes: okay.\n")
 		g.emitKeepAlive(g.r)
 		g.emit("return length, err\n")
 	})
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go b/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
index 39f654ea8..7edaf666c 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
@@ -154,11 +154,11 @@ func (g *interfaceGenerator) emitMarshallableForPrimitiveNewtype(nt *ast.Ident)
 
 	g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n")
 	g.emit("//go:nosplit\n")
-	g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyOutN(cc marshal.CopyContext, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
 
-		g.emit("length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
+		g.emit("length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
 		g.emitKeepAlive(g.r)
 		g.emit("return length, err\n")
 	})
@@ -166,19 +166,19 @@ func (g *interfaceGenerator) emitMarshallableForPrimitiveNewtype(nt *ast.Ident)
 
 	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
 	g.emit("//go:nosplit\n")
-	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyOut(cc marshal.CopyContext, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
-		g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r)
+		g.emit("return %s.CopyOutN(cc, addr, %s.SizeBytes())\n", g.r, g.r)
 	})
 	g.emit("}\n\n")
 
 	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
 	g.emit("//go:nosplit\n")
-	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyIn(cc marshal.CopyContext, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
 
-		g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
+		g.emit("length, err := cc.CopyInBytes(addr, buf) // escapes: okay.\n")
 		g.emitKeepAlive(g.r)
 		g.emit("return length, err\n")
 	})
@@ -211,7 +211,7 @@ func (g *interfaceGenerator) emitMarshallableSliceForPrimitiveNewtype(nt *ast.Id
 
 	g.emit("// Copy%sIn copies in a slice of %s objects from the task's memory.\n", slice.ident, eltType)
 	g.emit("//go:nosplit\n")
-	g.emit("func Copy%sIn(task marshal.Task, addr usermem.Addr, dst []%s) (int, error) {\n", slice.ident, eltType)
+	g.emit("func Copy%sIn(cc marshal.CopyContext, addr usermem.Addr, dst []%s) (int, error) {\n", slice.ident, eltType)
 	g.inIndent(func() {
 		g.emit("count := len(dst)\n")
 		g.emit("if count == 0 {\n")
@@ -223,7 +223,7 @@ func (g *interfaceGenerator) emitMarshallableSliceForPrimitiveNewtype(nt *ast.Id
 
 		g.emitCastSliceToByteSlice("&dst", "buf", "size * count")
 
-		g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
+		g.emit("length, err := cc.CopyInBytes(addr, buf) // escapes: okay.\n")
 		g.emitKeepAlive("dst")
 		g.emit("return length, err\n")
 	})
@@ -231,7 +231,7 @@ func (g *interfaceGenerator) emitMarshallableSliceForPrimitiveNewtype(nt *ast.Id
 
 	g.emit("// Copy%sOut copies a slice of %s objects to the task's memory.\n", slice.ident, eltType)
 	g.emit("//go:nosplit\n")
-	g.emit("func Copy%sOut(task marshal.Task, addr usermem.Addr, src []%s) (int, error) {\n", slice.ident, eltType)
+	g.emit("func Copy%sOut(cc marshal.CopyContext, addr usermem.Addr, src []%s) (int, error) {\n", slice.ident, eltType)
 	g.inIndent(func() {
 		g.emit("count := len(src)\n")
 		g.emit("if count == 0 {\n")
@@ -243,7 +243,7 @@ func (g *interfaceGenerator) emitMarshallableSliceForPrimitiveNewtype(nt *ast.Id
 
 		g.emitCastSliceToByteSlice("&src", "buf", "size * count")
 
-		g.emit("length, err := task.CopyOutBytes(addr, buf) // escapes: okay.\n")
+		g.emit("length, err := cc.CopyOutBytes(addr, buf) // escapes: okay.\n")
 		g.emitKeepAlive("src")
 		g.emit("return length, err\n")
 	})
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_struct.go b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
index 4b9cea08a..fe76d3785 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces_struct.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
@@ -20,6 +20,7 @@ package gomarshal
 import (
 	"fmt"
 	"go/ast"
+	"sort"
 	"strings"
 )
 
@@ -40,6 +41,8 @@ func (g *interfaceGenerator) areFieldsPackedExpression() (string, bool) {
 	for accessor, _ := range g.as {
 		cs = append(cs, fmt.Sprintf("%s.Packed()", accessor))
 	}
+	// Sort expressions for determinstic build outputs.
+	sort.Strings(cs)
 	return strings.Join(cs, " && "), true
 }
 
@@ -48,12 +51,6 @@ func (g *interfaceGenerator) areFieldsPackedExpression() (string, bool) {
 // later.
 func (g *interfaceGenerator) validateStruct(ts *ast.TypeSpec, st *ast.StructType) {
 	forEachStructField(st, func(f *ast.Field) {
-		if len(f.Names) == 0 {
-			g.abortAt(f.Pos(), "Cannot marshal structs with embedded fields, give the field a name; use '_' for anonymous fields such as padding fields")
-		}
-	})
-
-	forEachStructField(st, func(f *ast.Field) {
 		fieldDispatcher{
 			primitive: func(_, t *ast.Ident) {
 				g.validatePrimitiveNewtype(t)
@@ -98,7 +95,7 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 		var dynamicSizeTerms []string
 
 		forEachStructField(st, fieldDispatcher{
-			primitive: func(n, t *ast.Ident) {
+			primitive: func(_, t *ast.Ident) {
 				if size, dynamic := g.scalarSize(t); !dynamic {
 					primitiveSize += size
 				} else {
@@ -106,13 +103,13 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", t.Name))
 				}
 			},
-			selector: func(n, tX, tSel *ast.Ident) {
+			selector: func(_, tX, tSel *ast.Ident) {
 				tName := fmt.Sprintf("%s.%s", tX.Name, tSel.Name)
 				g.recordUsedImport(tX.Name)
 				g.recordUsedMarshallable(tName)
 				dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", tName))
 			},
-			array: func(n *ast.Ident, a *ast.ArrayType, t *ast.Ident) {
+			array: func(_ *ast.Ident, a *ast.ArrayType, t *ast.Ident) {
 				lenExpr := g.arrayLenExpr(a)
 				if size, dynamic := g.scalarSize(t); !dynamic {
 					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("%d*%s", size, lenExpr))
@@ -323,13 +320,13 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 	g.emit("//go:nosplit\n")
 	g.recordUsedImport("marshal")
 	g.recordUsedImport("usermem")
-	g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyOutN(cc marshal.CopyContext, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		fallback := func() {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
-			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes()) // escapes: okay.\n", g.r)
+			g.emit("buf := cc.CopyScratchBuffer(%s.SizeBytes()) // escapes: okay.\n", g.r)
 			g.emit("%s.MarshalBytes(buf) // escapes: fallback.\n", g.r)
-			g.emit("return task.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
+			g.emit("return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
 		}
 		if thisPacked {
 			g.recordUsedImport("reflect")
@@ -343,7 +340,7 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 			// Fast serialization.
 			g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
 
-			g.emit("length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
+			g.emit("length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
 			g.emitKeepAlive(g.r)
 			g.emit("return length, err\n")
 		} else {
@@ -356,9 +353,9 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 	g.emit("//go:nosplit\n")
 	g.recordUsedImport("marshal")
 	g.recordUsedImport("usermem")
-	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyOut(cc marshal.CopyContext, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
-		g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r)
+		g.emit("return %s.CopyOutN(cc, addr, %s.SizeBytes())\n", g.r, g.r)
 	})
 	g.emit("}\n\n")
 
@@ -366,12 +363,12 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 	g.emit("//go:nosplit\n")
 	g.recordUsedImport("marshal")
 	g.recordUsedImport("usermem")
-	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) CopyIn(cc marshal.CopyContext, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		fallback := func() {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
-			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes()) // escapes: okay.\n", g.r)
-			g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
+			g.emit("buf := cc.CopyScratchBuffer(%s.SizeBytes()) // escapes: okay.\n", g.r)
+			g.emit("length, err := cc.CopyInBytes(addr, buf) // escapes: okay.\n")
 			g.emit("// Unmarshal unconditionally. If we had a short copy-in, this results in a\n")
 			g.emit("// partially unmarshalled struct.\n")
 			g.emit("%s.UnmarshalBytes(buf) // escapes: fallback.\n", g.r)
@@ -389,7 +386,7 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 			// Fast deserialization.
 			g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
 
-			g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
+			g.emit("length, err := cc.CopyInBytes(addr, buf) // escapes: okay.\n")
 			g.emitKeepAlive(g.r)
 			g.emit("return length, err\n")
 		} else {
@@ -400,13 +397,13 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 
 	g.emit("// WriteTo implements io.WriterTo.WriteTo.\n")
 	g.recordUsedImport("io")
-	g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName())
+	g.emit("func (%s *%s) WriteTo(writer io.Writer) (int64, error) {\n", g.r, g.typeName())
 	g.inIndent(func() {
 		fallback := func() {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
 			g.emit("buf := make([]byte, %s.SizeBytes())\n", g.r)
 			g.emit("%s.MarshalBytes(buf)\n", g.r)
-			g.emit("length, err := w.Write(buf)\n")
+			g.emit("length, err := writer.Write(buf)\n")
 			g.emit("return int64(length), err\n")
 		}
 		if thisPacked {
@@ -421,7 +418,7 @@ func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
 			// Fast serialization.
 			g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
 
-			g.emit("length, err := w.Write(buf)\n")
+			g.emit("length, err := writer.Write(buf)\n")
 			g.emitKeepAlive(g.r)
 			g.emit("return int64(length), err\n")
 		} else {
@@ -442,7 +439,7 @@ func (g *interfaceGenerator) emitMarshallableSliceForStruct(st *ast.StructType,
 	g.recordUsedImport("usermem")
 
 	g.emit("// Copy%sIn copies in a slice of %s objects from the task's memory.\n", slice.ident, g.typeName())
-	g.emit("func Copy%sIn(task marshal.Task, addr usermem.Addr, dst []%s) (int, error) {\n", slice.ident, g.typeName())
+	g.emit("func Copy%sIn(cc marshal.CopyContext, addr usermem.Addr, dst []%s) (int, error) {\n", slice.ident, g.typeName())
 	g.inIndent(func() {
 		g.emit("count := len(dst)\n")
 		g.emit("if count == 0 {\n")
@@ -454,8 +451,8 @@ func (g *interfaceGenerator) emitMarshallableSliceForStruct(st *ast.StructType,
 
 		fallback := func() {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
-			g.emit("buf := task.CopyScratchBuffer(size * count)\n")
-			g.emit("length, err := task.CopyInBytes(addr, buf)\n\n")
+			g.emit("buf := cc.CopyScratchBuffer(size * count)\n")
+			g.emit("length, err := cc.CopyInBytes(addr, buf)\n\n")
 
 			g.emit("// Unmarshal as much as possible, even on error. First handle full objects.\n")
 			g.emit("limit := length/size\n")
@@ -489,7 +486,7 @@ func (g *interfaceGenerator) emitMarshallableSliceForStruct(st *ast.StructType,
 			// Fast deserialization.
 			g.emitCastSliceToByteSlice("&dst", "buf", "size * count")
 
-			g.emit("length, err := task.CopyInBytes(addr, buf)\n")
+			g.emit("length, err := cc.CopyInBytes(addr, buf)\n")
 			g.emitKeepAlive("dst")
 			g.emit("return length, err\n")
 		} else {
@@ -499,7 +496,7 @@ func (g *interfaceGenerator) emitMarshallableSliceForStruct(st *ast.StructType,
 	g.emit("}\n\n")
 
 	g.emit("// Copy%sOut copies a slice of %s objects to the task's memory.\n", slice.ident, g.typeName())
-	g.emit("func Copy%sOut(task marshal.Task, addr usermem.Addr, src []%s) (int, error) {\n", slice.ident, g.typeName())
+	g.emit("func Copy%sOut(cc marshal.CopyContext, addr usermem.Addr, src []%s) (int, error) {\n", slice.ident, g.typeName())
 	g.inIndent(func() {
 		g.emit("count := len(src)\n")
 		g.emit("if count == 0 {\n")
@@ -511,13 +508,13 @@ func (g *interfaceGenerator) emitMarshallableSliceForStruct(st *ast.StructType,
 
 		fallback := func() {
 			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
-			g.emit("buf := task.CopyScratchBuffer(size * count)\n")
+			g.emit("buf := cc.CopyScratchBuffer(size * count)\n")
 			g.emit("for idx := 0; idx < count; idx++ {\n")
 			g.inIndent(func() {
 				g.emit("src[idx].MarshalBytes(buf[size*idx:size*(idx+1)])\n")
 			})
 			g.emit("}\n")
-			g.emit("return task.CopyOutBytes(addr, buf)\n")
+			g.emit("return cc.CopyOutBytes(addr, buf)\n")
 		}
 		if thisPacked {
 			g.recordUsedImport("reflect")
@@ -531,7 +528,7 @@ func (g *interfaceGenerator) emitMarshallableSliceForStruct(st *ast.StructType,
 			// Fast serialization.
 			g.emitCastSliceToByteSlice("&src", "buf", "size * count")
 
-			g.emit("length, err := task.CopyOutBytes(addr, buf)\n")
+			g.emit("length, err := cc.CopyOutBytes(addr, buf)\n")
 			g.emitKeepAlive("src")
 			g.emit("return length, err\n")
 		} else {
diff --git a/tools/go_marshal/gomarshal/util.go b/tools/go_marshal/gomarshal/util.go
index d94314302..6a42691cd 100644
--- a/tools/go_marshal/gomarshal/util.go
+++ b/tools/go_marshal/gomarshal/util.go
@@ -79,7 +79,7 @@ type fieldDispatcher struct {
 }
 
 // Precondition: All dispatch callbacks that will be invoked must be
-// provided. Embedded fields are not allowed, len(f.Names) >= 1.
+// provided.
 func (fd fieldDispatcher) dispatch(f *ast.Field) {
 	// Each field declaration may actually be multiple declarations of the same
 	// type. For example, consider:
@@ -88,12 +88,24 @@ func (fd fieldDispatcher) dispatch(f *ast.Field) {
 	//     x, y, z int
 	// }
 	//
-	// We invoke the call-backs once per such instance. Embedded fields are not
-	// allowed, and results in a panic.
+	// We invoke the call-backs once per such instance.
+
+	// Handle embedded fields. Embedded fields have no names, but can be
+	// referenced by the type name.
 	if len(f.Names) < 1 {
-		panic("Precondition not met: attempted to dispatch on embedded field")
+		switch v := f.Type.(type) {
+		case *ast.Ident:
+			fd.primitive(v, v)
+		case *ast.SelectorExpr:
+			fd.selector(v.Sel, v.X.(*ast.Ident), v.Sel)
+		default:
+			// Note: Arrays can't be embedded, which is handled here.
+			panic(fmt.Sprintf("Attempted to dispatch on embedded field of unsupported kind: %#v", f.Type))
+		}
+		return
 	}
 
+	// Non-embedded field.
 	for _, name := range f.Names {
 		switch v := f.Type.(type) {
 		case *ast.Ident:
diff --git a/tools/go_marshal/main.go b/tools/go_marshal/main.go
index f74be5c29..6e4a3e8c4 100644
--- a/tools/go_marshal/main.go
+++ b/tools/go_marshal/main.go
@@ -31,10 +31,11 @@ import (
 )
 
 var (
-	pkg        = flag.String("pkg", "", "output package")
-	output     = flag.String("output", "", "output file")
-	outputTest = flag.String("output_test", "", "output file for tests")
-	imports    = flag.String("imports", "", "comma-separated list of extra packages to import in generated code")
+	pkg                     = flag.String("pkg", "", "output package")
+	output                  = flag.String("output", "", "output file")
+	outputTest              = flag.String("output_test", "", "output file for tests")
+	outputTestUnconditional = flag.String("output_test_unconditional", "", "output file for unconditional tests")
+	imports                 = flag.String("imports", "", "comma-separated list of extra packages to import in generated code")
 )
 
 func main() {
@@ -61,7 +62,7 @@ func main() {
 		// as an import.
 		extraImports = strings.Split(*imports, ",")
 	}
-	g, err := gomarshal.NewGenerator(flag.Args(), *output, *outputTest, *pkg, extraImports)
+	g, err := gomarshal.NewGenerator(flag.Args(), *output, *outputTest, *outputTestUnconditional, *pkg, extraImports)
 	if err != nil {
 		panic(err)
 	}
diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD
index 3d989823a..4b27773c2 100644
--- a/tools/go_marshal/test/BUILD
+++ b/tools/go_marshal/test/BUILD
@@ -35,10 +35,10 @@ go_test(
     srcs = ["marshal_test.go"],
     deps = [
         ":test",
+        "//pkg/marshal",
         "//pkg/syserror",
         "//pkg/usermem",
         "//tools/go_marshal/analysis",
-        "//tools/go_marshal/marshal",
         "@com_github_google_go_cmp//cmp:go_default_library",
     ],
 )
diff --git a/tools/go_marshal/test/escape/BUILD b/tools/go_marshal/test/escape/BUILD
index f74e6ffae..2981ef196 100644
--- a/tools/go_marshal/test/escape/BUILD
+++ b/tools/go_marshal/test/escape/BUILD
@@ -7,8 +7,8 @@ go_library(
     testonly = 1,
     srcs = ["escape.go"],
     deps = [
+        "//pkg/marshal",
         "//pkg/usermem",
-        "//tools/go_marshal/marshal",
         "//tools/go_marshal/test",
     ],
 )
diff --git a/tools/go_marshal/test/escape/escape.go b/tools/go_marshal/test/escape/escape.go
index 6a46ddbf8..7f62b0a2b 100644
--- a/tools/go_marshal/test/escape/escape.go
+++ b/tools/go_marshal/test/escape/escape.go
@@ -15,34 +15,34 @@
 package escape
 
 import (
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 	"gvisor.dev/gvisor/tools/go_marshal/test"
 )
 
-// dummyTask implements marshal.Task.
-type dummyTask struct {
+// dummyCopyContext implements marshal.CopyContext.
+type dummyCopyContext struct {
 }
 
-func (*dummyTask) CopyScratchBuffer(size int) []byte {
+func (*dummyCopyContext) CopyScratchBuffer(size int) []byte {
 	return make([]byte, size)
 }
 
-func (*dummyTask) CopyOutBytes(addr usermem.Addr, b []byte) (int, error) {
+func (*dummyCopyContext) CopyOutBytes(addr usermem.Addr, b []byte) (int, error) {
 	return len(b), nil
 }
 
-func (*dummyTask) CopyInBytes(addr usermem.Addr, b []byte) (int, error) {
+func (*dummyCopyContext) CopyInBytes(addr usermem.Addr, b []byte) (int, error) {
 	return len(b), nil
 }
 
-func (t *dummyTask) MarshalBytes(addr usermem.Addr, marshallable marshal.Marshallable) {
+func (t *dummyCopyContext) MarshalBytes(addr usermem.Addr, marshallable marshal.Marshallable) {
 	buf := t.CopyScratchBuffer(marshallable.SizeBytes())
 	marshallable.MarshalBytes(buf)
 	t.CopyOutBytes(addr, buf)
 }
 
-func (t *dummyTask) MarshalUnsafe(addr usermem.Addr, marshallable marshal.Marshallable) {
+func (t *dummyCopyContext) MarshalUnsafe(addr usermem.Addr, marshallable marshal.Marshallable) {
 	buf := t.CopyScratchBuffer(marshallable.SizeBytes())
 	marshallable.MarshalUnsafe(buf)
 	t.CopyOutBytes(addr, buf)
@@ -50,21 +50,22 @@ func (t *dummyTask) MarshalUnsafe(addr usermem.Addr, marshallable marshal.Marsha
 
 // +checkescape:all
 //go:nosplit
-func doCopyIn(t *dummyTask) {
+func doCopyIn(t *dummyCopyContext) {
 	var stat test.Stat
 	stat.CopyIn(t, usermem.Addr(0xf000ba12))
 }
 
 // +checkescape:all
 //go:nosplit
-func doCopyOut(t *dummyTask) {
+func doCopyOut(t *dummyCopyContext) {
 	var stat test.Stat
 	stat.CopyOut(t, usermem.Addr(0xf000ba12))
 }
 
 // +mustescape:builtin
 // +mustescape:stack
-func doMarshalBytesDirect(t *dummyTask) {
+//go:nosplit
+func doMarshalBytesDirect(t *dummyCopyContext) {
 	var stat test.Stat
 	buf := t.CopyScratchBuffer(stat.SizeBytes())
 	stat.MarshalBytes(buf)
@@ -73,7 +74,8 @@ func doMarshalBytesDirect(t *dummyTask) {
 
 // +mustescape:builtin
 // +mustescape:stack
-func doMarshalUnsafeDirect(t *dummyTask) {
+//go:nosplit
+func doMarshalUnsafeDirect(t *dummyCopyContext) {
 	var stat test.Stat
 	buf := t.CopyScratchBuffer(stat.SizeBytes())
 	stat.MarshalUnsafe(buf)
@@ -82,14 +84,16 @@ func doMarshalUnsafeDirect(t *dummyTask) {
 
 // +mustescape:local,heap
 // +mustescape:stack
-func doMarshalBytesViaMarshallable(t *dummyTask) {
+//go:nosplit
+func doMarshalBytesViaMarshallable(t *dummyCopyContext) {
 	var stat test.Stat
 	t.MarshalBytes(usermem.Addr(0xf000ba12), &stat)
 }
 
 // +mustescape:local,heap
 // +mustescape:stack
-func doMarshalUnsafeViaMarshallable(t *dummyTask) {
+//go:nosplit
+func doMarshalUnsafeViaMarshallable(t *dummyCopyContext) {
 	var stat test.Stat
 	t.MarshalUnsafe(usermem.Addr(0xf000ba12), &stat)
 }
diff --git a/tools/go_marshal/test/marshal_test.go b/tools/go_marshal/test/marshal_test.go
index 16829ee45..a00f9a684 100644
--- a/tools/go_marshal/test/marshal_test.go
+++ b/tools/go_marshal/test/marshal_test.go
@@ -27,22 +27,22 @@ import (
 	"unsafe"
 
 	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/tools/go_marshal/analysis"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 	"gvisor.dev/gvisor/tools/go_marshal/test"
 )
 
 var simulatedErr error = syserror.EFAULT
 
-// mockTask implements marshal.Task.
-type mockTask struct {
+// mockCopyContext implements marshal.CopyContext.
+type mockCopyContext struct {
 	taskMem usermem.BytesIO
 }
 
 // populate fills the task memory with the contents of val.
-func (t *mockTask) populate(val interface{}) {
+func (t *mockCopyContext) populate(val interface{}) {
 	var buf bytes.Buffer
 	// Use binary.Write so we aren't testing go-marshal against its own
 	// potentially buggy implementation.
@@ -52,7 +52,7 @@ func (t *mockTask) populate(val interface{}) {
 	t.taskMem.Bytes = buf.Bytes()
 }
 
-func (t *mockTask) setLimit(n int) {
+func (t *mockCopyContext) setLimit(n int) {
 	if len(t.taskMem.Bytes) < n {
 		grown := make([]byte, n)
 		copy(grown, t.taskMem.Bytes)
@@ -62,22 +62,22 @@ func (t *mockTask) setLimit(n int) {
 	t.taskMem.Bytes = t.taskMem.Bytes[:n]
 }
 
-// CopyScratchBuffer implements marshal.Task.CopyScratchBuffer.
-func (t *mockTask) CopyScratchBuffer(size int) []byte {
+// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer.
+func (t *mockCopyContext) CopyScratchBuffer(size int) []byte {
 	return make([]byte, size)
 }
 
-// CopyOutBytes implements marshal.Task.CopyOutBytes. The implementation
+// CopyOutBytes implements marshal.CopyContext.CopyOutBytes. The implementation
 // completely ignores the target address and stores a copy of b in its
 // internally buffer, overriding any previous contents.
-func (t *mockTask) CopyOutBytes(_ usermem.Addr, b []byte) (int, error) {
+func (t *mockCopyContext) CopyOutBytes(_ usermem.Addr, b []byte) (int, error) {
 	return t.taskMem.CopyOut(nil, 0, b, usermem.IOOpts{})
 }
 
-// CopyInBytes implements marshal.Task.CopyInBytes. The implementation
+// CopyInBytes implements marshal.CopyContext.CopyInBytes. The implementation
 // completely ignores the source address and always fills b from the begining of
 // its internal buffer.
-func (t *mockTask) CopyInBytes(_ usermem.Addr, b []byte) (int, error) {
+func (t *mockCopyContext) CopyInBytes(_ usermem.Addr, b []byte) (int, error) {
 	return t.taskMem.CopyIn(nil, 0, b, usermem.IOOpts{})
 }
 
@@ -171,11 +171,11 @@ func compareMemory(t *testing.T, expected, actual []byte, n int) {
 // dst. The task signals an error at limit bytes during copy-in, which should
 // result in a truncated unmarshalling.
 func limitedCopyIn(t *testing.T, src, dst marshal.Marshallable, limit int) {
-	var task mockTask
-	task.populate(src)
-	task.setLimit(limit)
+	var cc mockCopyContext
+	cc.populate(src)
+	cc.setLimit(limit)
 
-	n, err := dst.CopyIn(&task, usermem.Addr(0))
+	n, err := dst.CopyIn(&cc, usermem.Addr(0))
 	if n != limit {
 		t.Errorf("CopyIn copied unexpected number of bytes, expected %d, got %d", limit, n)
 	}
@@ -202,10 +202,10 @@ func limitedCopyIn(t *testing.T, src, dst marshal.Marshallable, limit int) {
 // limitedCopyOut marshals src to task memory. The task signals an error at
 // limit bytes during copy-out, which should result in a truncated marshalling.
 func limitedCopyOut(t *testing.T, src marshal.Marshallable, limit int) {
-	var task mockTask
-	task.setLimit(limit)
+	var cc mockCopyContext
+	cc.setLimit(limit)
 
-	n, err := src.CopyOut(&task, usermem.Addr(0))
+	n, err := src.CopyOut(&cc, usermem.Addr(0))
 	if n != limit {
 		t.Errorf("CopyOut copied unexpected number of bytes, expected %d, got %d", limit, n)
 	}
@@ -215,7 +215,7 @@ func limitedCopyOut(t *testing.T, src marshal.Marshallable, limit int) {
 
 	expectedMem := unsafeMemory(src)
 	defer runtime.KeepAlive(src)
-	actualMem := task.taskMem.Bytes
+	actualMem := cc.taskMem.Bytes
 
 	compareMemory(t, expectedMem, actualMem, n)
 }
@@ -223,10 +223,10 @@ func limitedCopyOut(t *testing.T, src marshal.Marshallable, limit int) {
 // copyOutN marshals src to task memory, requesting the marshalling to be
 // limited to limit bytes.
 func copyOutN(t *testing.T, src marshal.Marshallable, limit int) {
-	var task mockTask
-	task.setLimit(limit)
+	var cc mockCopyContext
+	cc.setLimit(limit)
 
-	n, err := src.CopyOutN(&task, usermem.Addr(0), limit)
+	n, err := src.CopyOutN(&cc, usermem.Addr(0), limit)
 	if err != nil {
 		t.Errorf("CopyOut returned unexpected error: %v", err)
 	}
@@ -236,7 +236,7 @@ func copyOutN(t *testing.T, src marshal.Marshallable, limit int) {
 
 	expectedMem := unsafeMemory(src)
 	defer runtime.KeepAlive(src)
-	actualMem := task.taskMem.Bytes
+	actualMem := cc.taskMem.Bytes
 
 	t.Logf("Expected: %v + %v\n", expectedMem[:n], expectedMem[n:])
 	t.Logf("Actual  : %v + %v\n", actualMem[:n], actualMem[n:])
@@ -303,20 +303,20 @@ func TestLimitedMarshalling(t *testing.T) {
 func TestLimitedSliceMarshalling(t *testing.T) {
 	types := []struct {
 		arrayPtrType reflect.Type
-		copySliceIn  func(task marshal.Task, addr usermem.Addr, dstSlice interface{}) (int, error)
-		copySliceOut func(task marshal.Task, addr usermem.Addr, srcSlice interface{}) (int, error)
+		copySliceIn  func(cc marshal.CopyContext, addr usermem.Addr, dstSlice interface{}) (int, error)
+		copySliceOut func(cc marshal.CopyContext, addr usermem.Addr, srcSlice interface{}) (int, error)
 		unsafeMemory func(arrPtr interface{}) []byte
 	}{
 		// Packed types.
 		{
 			reflect.TypeOf((*[20]test.Stat)(nil)),
-			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+			func(cc marshal.CopyContext, addr usermem.Addr, dst interface{}) (int, error) {
 				slice := dst.(*[20]test.Stat)[:]
-				return test.CopyStatSliceIn(task, addr, slice)
+				return test.CopyStatSliceIn(cc, addr, slice)
 			},
-			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+			func(cc marshal.CopyContext, addr usermem.Addr, src interface{}) (int, error) {
 				slice := src.(*[20]test.Stat)[:]
-				return test.CopyStatSliceOut(task, addr, slice)
+				return test.CopyStatSliceOut(cc, addr, slice)
 			},
 			func(a interface{}) []byte {
 				slice := a.(*[20]test.Stat)[:]
@@ -325,13 +325,13 @@ func TestLimitedSliceMarshalling(t *testing.T) {
 		},
 		{
 			reflect.TypeOf((*[1]test.Stat)(nil)),
-			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+			func(cc marshal.CopyContext, addr usermem.Addr, dst interface{}) (int, error) {
 				slice := dst.(*[1]test.Stat)[:]
-				return test.CopyStatSliceIn(task, addr, slice)
+				return test.CopyStatSliceIn(cc, addr, slice)
 			},
-			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+			func(cc marshal.CopyContext, addr usermem.Addr, src interface{}) (int, error) {
 				slice := src.(*[1]test.Stat)[:]
-				return test.CopyStatSliceOut(task, addr, slice)
+				return test.CopyStatSliceOut(cc, addr, slice)
 			},
 			func(a interface{}) []byte {
 				slice := a.(*[1]test.Stat)[:]
@@ -340,13 +340,13 @@ func TestLimitedSliceMarshalling(t *testing.T) {
 		},
 		{
 			reflect.TypeOf((*[5]test.SignalSetAlias)(nil)),
-			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+			func(cc marshal.CopyContext, addr usermem.Addr, dst interface{}) (int, error) {
 				slice := dst.(*[5]test.SignalSetAlias)[:]
-				return test.CopySignalSetAliasSliceIn(task, addr, slice)
+				return test.CopySignalSetAliasSliceIn(cc, addr, slice)
 			},
-			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+			func(cc marshal.CopyContext, addr usermem.Addr, src interface{}) (int, error) {
 				slice := src.(*[5]test.SignalSetAlias)[:]
-				return test.CopySignalSetAliasSliceOut(task, addr, slice)
+				return test.CopySignalSetAliasSliceOut(cc, addr, slice)
 			},
 			func(a interface{}) []byte {
 				slice := a.(*[5]test.SignalSetAlias)[:]
@@ -356,13 +356,13 @@ func TestLimitedSliceMarshalling(t *testing.T) {
 		// Non-packed types.
 		{
 			reflect.TypeOf((*[20]test.Type1)(nil)),
-			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+			func(cc marshal.CopyContext, addr usermem.Addr, dst interface{}) (int, error) {
 				slice := dst.(*[20]test.Type1)[:]
-				return test.CopyType1SliceIn(task, addr, slice)
+				return test.CopyType1SliceIn(cc, addr, slice)
 			},
-			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+			func(cc marshal.CopyContext, addr usermem.Addr, src interface{}) (int, error) {
 				slice := src.(*[20]test.Type1)[:]
-				return test.CopyType1SliceOut(task, addr, slice)
+				return test.CopyType1SliceOut(cc, addr, slice)
 			},
 			func(a interface{}) []byte {
 				slice := a.(*[20]test.Type1)[:]
@@ -371,13 +371,13 @@ func TestLimitedSliceMarshalling(t *testing.T) {
 		},
 		{
 			reflect.TypeOf((*[1]test.Type1)(nil)),
-			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+			func(cc marshal.CopyContext, addr usermem.Addr, dst interface{}) (int, error) {
 				slice := dst.(*[1]test.Type1)[:]
-				return test.CopyType1SliceIn(task, addr, slice)
+				return test.CopyType1SliceIn(cc, addr, slice)
 			},
-			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+			func(cc marshal.CopyContext, addr usermem.Addr, src interface{}) (int, error) {
 				slice := src.(*[1]test.Type1)[:]
-				return test.CopyType1SliceOut(task, addr, slice)
+				return test.CopyType1SliceOut(cc, addr, slice)
 			},
 			func(a interface{}) []byte {
 				slice := a.(*[1]test.Type1)[:]
@@ -386,13 +386,13 @@ func TestLimitedSliceMarshalling(t *testing.T) {
 		},
 		{
 			reflect.TypeOf((*[7]test.Type8)(nil)),
-			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+			func(cc marshal.CopyContext, addr usermem.Addr, dst interface{}) (int, error) {
 				slice := dst.(*[7]test.Type8)[:]
-				return test.CopyType8SliceIn(task, addr, slice)
+				return test.CopyType8SliceIn(cc, addr, slice)
 			},
-			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+			func(cc marshal.CopyContext, addr usermem.Addr, src interface{}) (int, error) {
 				slice := src.(*[7]test.Type8)[:]
-				return test.CopyType8SliceOut(task, addr, slice)
+				return test.CopyType8SliceOut(cc, addr, slice)
 			},
 			func(a interface{}) []byte {
 				slice := a.(*[7]test.Type8)[:]
@@ -439,11 +439,11 @@ func TestLimitedSliceMarshalling(t *testing.T) {
 			limit += elem.SizeBytes() / 2
 			analysis.RandomizeValue(expected)
 
-			var task mockTask
-			task.populate(expected)
-			task.setLimit(limit)
+			var cc mockCopyContext
+			cc.populate(expected)
+			cc.setLimit(limit)
 
-			n, err := tt.copySliceIn(&task, usermem.Addr(0), actual)
+			n, err := tt.copySliceIn(&cc, usermem.Addr(0), actual)
 			if n != limit {
 				t.Errorf("CopyIn copied unexpected number of bytes, expected %d, got %d", limit, n)
 			}
@@ -493,11 +493,11 @@ func TestLimitedSliceMarshalling(t *testing.T) {
 			limit += elem.SizeBytes() / 2
 			analysis.RandomizeValue(expected)
 
-			var task mockTask
-			task.populate(expected)
-			task.setLimit(limit)
+			var cc mockCopyContext
+			cc.populate(expected)
+			cc.setLimit(limit)
 
-			n, err := tt.copySliceOut(&task, usermem.Addr(0), expected)
+			n, err := tt.copySliceOut(&cc, usermem.Addr(0), expected)
 			if n != limit {
 				t.Errorf("CopyIn copied unexpected number of bytes, expected %d, got %d", limit, n)
 			}
@@ -507,7 +507,7 @@ func TestLimitedSliceMarshalling(t *testing.T) {
 
 			expectedMem := tt.unsafeMemory(expected)
 			defer runtime.KeepAlive(expected)
-			actualMem := task.taskMem.Bytes
+			actualMem := cc.taskMem.Bytes
 
 			compareMemory(t, expectedMem, actualMem, n)
 		})
diff --git a/tools/go_marshal/test/test.go b/tools/go_marshal/test/test.go
index f75ca1b7f..d9e9f341b 100644
--- a/tools/go_marshal/test/test.go
+++ b/tools/go_marshal/test/test.go
@@ -174,3 +174,27 @@ type Type9 struct {
 	x int64
 	y [sizeA]int32
 }
+
+// Type10Embed is a test data type which is be embedded into another type.
+//
+// +marshal
+type Type10Embed struct {
+	x int64
+}
+
+// Type10 is a test data type which contains an embedded struct.
+//
+// +marshal
+type Type10 struct {
+	Type10Embed
+	y int64
+}
+
+// Type11 is a test data type which contains an embedded struct from an external
+// package.
+//
+// +marshal
+type Type11 struct {
+	ex.External
+	y int64
+}
diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go
index 4f6ed208a..e1de12e25 100644
--- a/tools/go_stateify/main.go
+++ b/tools/go_stateify/main.go
@@ -39,7 +39,7 @@ var (
 )
 
 // resolveTypeName returns a qualified type name.
-func resolveTypeName(name string, typ ast.Expr) (field string, qualified string) {
+func resolveTypeName(typ ast.Expr) (field string, qualified string) {
 	for done := false; !done; {
 		// Resolve star expressions.
 		switch rs := typ.(type) {
@@ -69,11 +69,7 @@ func resolveTypeName(name string, typ ast.Expr) (field string, qualified string)
 	}
 
 	// Figure out actual type name.
-	ident, ok := typ.(*ast.Ident)
-	if !ok {
-		panic(fmt.Sprintf("type not supported: %s (involves anonymous types?)", name))
-	}
-	field = ident.Name
+	field = typ.(*ast.Ident).Name
 	qualified = qualified + field
 	return
 }
@@ -119,7 +115,7 @@ func scanFields(ss *ast.StructType, prefix string, fn scanFunctions) {
 		} else {
 			// Anonymous types can't be embedded, so we don't need
 			// to worry about providing a useful name here.
-			name, _ = resolveTypeName("", field.Type)
+			name, _ = resolveTypeName(field.Type)
 		}
 
 		// Skip _ fields.
@@ -214,9 +210,6 @@ func main() {
 	emitRegister := func(name string) {
 		initCalls = append(initCalls, fmt.Sprintf("%sRegister((*%s)(nil))", statePrefix, name))
 	}
-	emitZeroCheck := func(name string) {
-		fmt.Fprintf(outputFile, "	if !%sIsZeroValue(&x.%s) { %sFailf(\"%s is %%#v, expected zero\", &x.%s) }\n", statePrefix, name, statePrefix, name, name)
-	}
 
 	// Automated warning.
 	fmt.Fprint(outputFile, "// automatically generated by stateify.\n\n")
@@ -265,52 +258,39 @@ func main() {
 	}
 
 	type method struct {
-		receiver string
-		name     string
+		typeName   string
+		methodName string
 	}
 
-	// Search for and add all methods with a pointer receiver and no other
-	// arguments to a set. We support auto-detecting the existence of
-	// several different methods with this signature.
-	simpleMethods := map[method]struct{}{}
+	// Search for and add all method to a set. We auto-detecting several
+	// different methods (and insert them if we don't find them, in order
+	// to ensure that expectations match reality).
+	//
+	// While we do this, figure out the right receiver name. If there are
+	// multiple distinct receivers, then we will just pick the last one.
+	simpleMethods := make(map[method]struct{})
+	receiverNames := make(map[string]string)
 	for _, f := range files {
-
 		// Go over all functions.
 		for _, decl := range f.Decls {
 			d, ok := decl.(*ast.FuncDecl)
 			if !ok {
 				continue
 			}
-			if d.Name == nil || d.Recv == nil || d.Type == nil {
+			if d.Recv == nil || len(d.Recv.List) != 1 {
 				// Not a named method.
 				continue
 			}
-			if len(d.Recv.List) != 1 {
-				// Wrong number of receivers?
-				continue
-			}
-			if d.Type.Params != nil && len(d.Type.Params.List) != 0 {
-				// Has argument(s).
-				continue
-			}
-			if d.Type.Results != nil && len(d.Type.Results.List) != 0 {
-				// Has return(s).
-				continue
-			}
 
-			pt, ok := d.Recv.List[0].Type.(*ast.StarExpr)
-			if !ok {
-				// Not a pointer receiver.
-				continue
+			// Save the method and the receiver.
+			name, _ := resolveTypeName(d.Recv.List[0].Type)
+			simpleMethods[method{
+				typeName:   name,
+				methodName: d.Name.Name,
+			}] = struct{}{}
+			if len(d.Recv.List[0].Names) > 0 {
+				receiverNames[name] = d.Recv.List[0].Names[0].Name
 			}
-
-			t, ok := pt.X.(*ast.Ident)
-			if !ok {
-				// This shouldn't happen with valid Go.
-				continue
-			}
-
-			simpleMethods[method{t.Name, d.Name.Name}] = struct{}{}
 		}
 	}
 
@@ -349,6 +329,11 @@ func main() {
 
 			for _, gs := range d.Specs {
 				ts := gs.(*ast.TypeSpec)
+				recv, ok := receiverNames[ts.Name.Name]
+				if !ok {
+					// Maybe no methods were defined?
+					recv = strings.ToLower(ts.Name.Name[:1])
+				}
 				switch x := ts.Type.(type) {
 				case *ast.StructType:
 					maybeEmitImports()
@@ -365,29 +350,32 @@ func main() {
 						emitField(name)
 					}
 					emitLoadValue := func(name, typName string) {
-						fmt.Fprintf(outputFile, "	m.LoadValue(%d, new(%s), func(y interface{}) { x.load%s(y.(%s)) })\n", fields[name], typName, camelCased(name), typName)
+						fmt.Fprintf(outputFile, "	stateSourceObject.LoadValue(%d, new(%s), func(y interface{}) { %s.load%s(y.(%s)) })\n", fields[name], typName, recv, camelCased(name), typName)
 					}
 					emitLoad := func(name string) {
-						fmt.Fprintf(outputFile, "	m.Load(%d, &x.%s)\n", fields[name], name)
+						fmt.Fprintf(outputFile, "	stateSourceObject.Load(%d, &%s.%s)\n", fields[name], recv, name)
 					}
 					emitLoadWait := func(name string) {
-						fmt.Fprintf(outputFile, "	m.LoadWait(%d, &x.%s)\n", fields[name], name)
+						fmt.Fprintf(outputFile, "	stateSourceObject.LoadWait(%d, &%s.%s)\n", fields[name], recv, name)
 					}
 					emitSaveValue := func(name, typName string) {
-						fmt.Fprintf(outputFile, "	var %s %s = x.save%s()\n", name, typName, camelCased(name))
-						fmt.Fprintf(outputFile, "	m.SaveValue(%d, %s)\n", fields[name], name)
+						fmt.Fprintf(outputFile, "	var %sValue %s = %s.save%s()\n", name, typName, recv, camelCased(name))
+						fmt.Fprintf(outputFile, "	stateSinkObject.SaveValue(%d, %sValue)\n", fields[name], name)
 					}
 					emitSave := func(name string) {
-						fmt.Fprintf(outputFile, "	m.Save(%d, &x.%s)\n", fields[name], name)
+						fmt.Fprintf(outputFile, "	stateSinkObject.Save(%d, &%s.%s)\n", fields[name], recv, name)
+					}
+					emitZeroCheck := func(name string) {
+						fmt.Fprintf(outputFile, "	if !%sIsZeroValue(&%s.%s) { %sFailf(\"%s is %%#v, expected zero\", &%s.%s) }\n", statePrefix, recv, name, statePrefix, name, recv, name)
 					}
 
 					// Generate the type name method.
-					fmt.Fprintf(outputFile, "func (x *%s) StateTypeName() string {\n", ts.Name.Name)
+					fmt.Fprintf(outputFile, "func (%s *%s) StateTypeName() string {\n", recv, ts.Name.Name)
 					fmt.Fprintf(outputFile, "	return \"%s.%s\"\n", *fullPkg, ts.Name.Name)
 					fmt.Fprintf(outputFile, "}\n\n")
 
 					// Generate the fields method.
-					fmt.Fprintf(outputFile, "func (x *%s) StateFields() []string {\n", ts.Name.Name)
+					fmt.Fprintf(outputFile, "func (%s *%s) StateFields() []string {\n", recv, ts.Name.Name)
 					fmt.Fprintf(outputFile, "	return []string{\n")
 					scanFields(x, "", scanFunctions{
 						normal: emitField,
@@ -401,8 +389,11 @@ func main() {
 					// the code from compiling if a custom beforeSave was defined in a
 					// file not provided to this binary and prevents inherited methods
 					// from being called multiple times by overriding them.
-					if _, ok := simpleMethods[method{ts.Name.Name, "beforeSave"}]; !ok && generateSaverLoader {
-						fmt.Fprintf(outputFile, "func (x *%s) beforeSave() {}\n\n", ts.Name.Name)
+					if _, ok := simpleMethods[method{
+						typeName:   ts.Name.Name,
+						methodName: "beforeSave",
+					}]; !ok && generateSaverLoader {
+						fmt.Fprintf(outputFile, "func (%s *%s) beforeSave() {}\n\n", recv, ts.Name.Name)
 					}
 
 					// Generate the save method.
@@ -412,8 +403,8 @@ func main() {
 					// on this specific behavior, but the ability to specify slots
 					// allows a manual implementation to be order-dependent.
 					if generateSaverLoader {
-						fmt.Fprintf(outputFile, "func (x *%s) StateSave(m %sSink) {\n", ts.Name.Name, statePrefix)
-						fmt.Fprintf(outputFile, "	x.beforeSave()\n")
+						fmt.Fprintf(outputFile, "func (%s *%s) StateSave(stateSinkObject %sSink) {\n", recv, ts.Name.Name, statePrefix)
+						fmt.Fprintf(outputFile, "	%s.beforeSave()\n", recv)
 						scanFields(x, "", scanFunctions{zerovalue: emitZeroCheck})
 						scanFields(x, "", scanFunctions{value: emitSaveValue})
 						scanFields(x, "", scanFunctions{normal: emitSave, wait: emitSave})
@@ -422,16 +413,19 @@ func main() {
 
 					// Define afterLoad if a definition was not found. We do this for
 					// the same reason that we do it for beforeSave.
-					_, hasAfterLoad := simpleMethods[method{ts.Name.Name, "afterLoad"}]
+					_, hasAfterLoad := simpleMethods[method{
+						typeName:   ts.Name.Name,
+						methodName: "afterLoad",
+					}]
 					if !hasAfterLoad && generateSaverLoader {
-						fmt.Fprintf(outputFile, "func (x *%s) afterLoad() {}\n\n", ts.Name.Name)
+						fmt.Fprintf(outputFile, "func (%s *%s) afterLoad() {}\n\n", recv, ts.Name.Name)
 					}
 
 					// Generate the load method.
 					//
 					// N.B. See the comment above for the save method.
 					if generateSaverLoader {
-						fmt.Fprintf(outputFile, "func (x *%s) StateLoad(m %sSource) {\n", ts.Name.Name, statePrefix)
+						fmt.Fprintf(outputFile, "func (%s *%s) StateLoad(stateSourceObject %sSource) {\n", recv, ts.Name.Name, statePrefix)
 						scanFields(x, "", scanFunctions{normal: emitLoad, wait: emitLoadWait})
 						scanFields(x, "", scanFunctions{value: emitLoadValue})
 						if hasAfterLoad {
@@ -439,7 +433,7 @@ func main() {
 							// AfterLoad is called, the object encodes a dependency on
 							// referred objects (i.e. fields). This means that afterLoad
 							// will not be called until the other afterLoads are called.
-							fmt.Fprintf(outputFile, "	m.AfterLoad(x.afterLoad)\n")
+							fmt.Fprintf(outputFile, "	stateSourceObject.AfterLoad(%s.afterLoad)\n", recv)
 						}
 						fmt.Fprintf(outputFile, "}\n\n")
 					}
@@ -451,10 +445,10 @@ func main() {
 					maybeEmitImports()
 
 					// Generate the info methods.
-					fmt.Fprintf(outputFile, "func (x *%s) StateTypeName() string {\n", ts.Name.Name)
+					fmt.Fprintf(outputFile, "func (%s *%s) StateTypeName() string {\n", recv, ts.Name.Name)
 					fmt.Fprintf(outputFile, "	return \"%s.%s\"\n", *fullPkg, ts.Name.Name)
 					fmt.Fprintf(outputFile, "}\n\n")
-					fmt.Fprintf(outputFile, "func (x *%s) StateFields() []string {\n", ts.Name.Name)
+					fmt.Fprintf(outputFile, "func (%s *%s) StateFields() []string {\n", recv, ts.Name.Name)
 					fmt.Fprintf(outputFile, "	return nil\n")
 					fmt.Fprintf(outputFile, "}\n\n")
 
diff --git a/tools/issue_reviver/BUILD b/tools/issue_reviver/BUILD
deleted file mode 100644
index 4ef1a3124..000000000
--- a/tools/issue_reviver/BUILD
+++ /dev/null
@@ -1,12 +0,0 @@
-load("//tools:defs.bzl", "go_binary")
-
-package(licenses = ["notice"])
-
-go_binary(
-    name = "issue_reviver",
-    srcs = ["main.go"],
-    deps = [
-        "//tools/issue_reviver/github",
-        "//tools/issue_reviver/reviver",
-    ],
-)
diff --git a/tools/issue_reviver/github/BUILD b/tools/issue_reviver/github/BUILD
deleted file mode 100644
index 0eabc2835..000000000
--- a/tools/issue_reviver/github/BUILD
+++ /dev/null
@@ -1,24 +0,0 @@
-load("//tools:defs.bzl", "go_library", "go_test")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "github",
-    srcs = ["github.go"],
-    nogo = False,
-    visibility = [
-        "//tools/issue_reviver:__subpackages__",
-    ],
-    deps = [
-        "//tools/issue_reviver/reviver",
-        "@com_github_google_go_github_v28//github:go_default_library",
-        "@org_golang_x_oauth2//:go_default_library",
-    ],
-)
-
-go_test(
-    name = "github_test",
-    size = "small",
-    srcs = ["github_test.go"],
-    library = ":github",
-)
diff --git a/tools/issue_reviver/main.go b/tools/issue_reviver/main.go
deleted file mode 100644
index 47c796b8a..000000000
--- a/tools/issue_reviver/main.go
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package main is the entry point for issue_reviver.
-package main
-
-import (
-	"flag"
-	"fmt"
-	"io/ioutil"
-	"os"
-	"strings"
-
-	"gvisor.dev/gvisor/tools/issue_reviver/github"
-	"gvisor.dev/gvisor/tools/issue_reviver/reviver"
-)
-
-var (
-	owner     string
-	repo      string
-	tokenFile string
-	path      string
-	dryRun    bool
-)
-
-// Keep the options simple for now. Supports only a single path and repo.
-func init() {
-	flag.StringVar(&owner, "owner", "", "Github project org/owner to look for issues")
-	flag.StringVar(&repo, "repo", "", "Github repo to look for issues")
-	flag.StringVar(&tokenFile, "oauth-token-file", "", "Path to file containing the OAUTH token to be used as credential to github")
-	flag.StringVar(&path, "path", ".", "Path to scan for TODOs")
-	flag.BoolVar(&dryRun, "dry-run", false, "If set to true, no changes are made to issues")
-}
-
-func main() {
-	// Set defaults from the environment.
-	repository := os.Getenv("GITHUB_REPOSITORY")
-	if parts := strings.SplitN(repository, "/", 2); len(parts) == 2 {
-		owner = parts[0]
-		repo = parts[1]
-	}
-
-	// Parse flags.
-	flag.Parse()
-
-	// Check for mandatory parameters.
-	if len(owner) == 0 {
-		fmt.Println("missing --owner option.")
-		flag.Usage()
-		os.Exit(1)
-	}
-	if len(repo) == 0 {
-		fmt.Println("missing --repo option.")
-		flag.Usage()
-		os.Exit(1)
-	}
-	if len(path) == 0 {
-		fmt.Println("missing --path option.")
-		flag.Usage()
-		os.Exit(1)
-	}
-
-	// The access token may be passed as a file so it doesn't show up in
-	// command line arguments. It also may be provided through the
-	// environment to faciliate use through GitHub's CI system.
-	token := os.Getenv("GITHUB_TOKEN")
-	if len(tokenFile) != 0 {
-		bytes, err := ioutil.ReadFile(tokenFile)
-		if err != nil {
-			fmt.Println(err.Error())
-			os.Exit(1)
-		}
-		token = string(bytes)
-	}
-
-	bugger, err := github.NewBugger(token, owner, repo, dryRun)
-	if err != nil {
-		fmt.Fprintln(os.Stderr, "Error getting github issues:", err)
-		os.Exit(1)
-	}
-	rev := reviver.New([]string{path}, []reviver.Bugger{bugger})
-	if errs := rev.Run(); len(errs) > 0 {
-		fmt.Fprintf(os.Stderr, "Encountered %d errors:\n", len(errs))
-		for _, err := range errs {
-			fmt.Fprintf(os.Stderr, "\t%v\n", err)
-		}
-		os.Exit(1)
-	}
-}
diff --git a/tools/issue_reviver/reviver/BUILD b/tools/issue_reviver/reviver/BUILD
deleted file mode 100644
index d262932bd..000000000
--- a/tools/issue_reviver/reviver/BUILD
+++ /dev/null
@@ -1,18 +0,0 @@
-load("//tools:defs.bzl", "go_library", "go_test")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "reviver",
-    srcs = ["reviver.go"],
-    visibility = [
-        "//tools/issue_reviver:__subpackages__",
-    ],
-)
-
-go_test(
-    name = "reviver_test",
-    size = "small",
-    srcs = ["reviver_test.go"],
-    library = ":reviver",
-)
diff --git a/tools/make_apt.sh b/tools/make_apt.sh
index 3fb1066e5..13c5edd76 100755
--- a/tools/make_apt.sh
+++ b/tools/make_apt.sh
@@ -54,18 +54,22 @@ declare -r release="${root}/dists/${suite}"
 mkdir -p "${release}"
 
 # Create a temporary keyring, and ensure it is cleaned up.
+# Using separate homedir allows us to install apt repositories multiple times
+# using the same key. This is a limitation in GnuPG pre-2.1.
 declare -r keyring=$(mktemp /tmp/keyringXXXXXX.gpg)
+declare -r homedir=$(mktemp -d /tmp/homedirXXXXXX)
+declare -r gpg_opts=("--no-default-keyring" "--secret-keyring" "${keyring}" "--homedir" "${homedir}")
 cleanup() {
-  rm -f "${keyring}"
+  rm -rf "${keyring}" "${homedir}"
 }
 trap cleanup EXIT
 
 # We attempt the import twice because the first one will fail if the public key
 # is not found. This isn't actually a failure for us, because we don't require
-# the public (this may be stored separately). The second import will succeed
+# the public key (this may be stored separately). The second import will succeed
 # because, in reality, the first import succeeded and it's a no-op.
-gpg --no-default-keyring --keyring "${keyring}" --import "${private_key}" || \
-  gpg --no-default-keyring --keyring "${keyring}" --import "${private_key}"
+gpg "${gpg_opts[@]}" --import "${private_key}" || \
+  gpg "${gpg_opts[@]}" --import "${private_key}"
 
 # Copy the packages into the root.
 for pkg in "$@"; do
@@ -100,7 +104,8 @@ for pkg in "$@"; do
   cp -a "${pkg}" "${target}"
   chmod 0644 "${target}"
   if [[ "${ext}" == "deb" ]]; then
-    dpkg-sig -g "--no-default-keyring --keyring ${keyring}" --sign builder "${target}"
+    # We use [*] here to expand the gpg_opts array into a single shell-word.
+    dpkg-sig -g "${gpg_opts[*]}" --sign builder "${target}"
   fi
 done
 
@@ -135,5 +140,5 @@ rm "${release}"/apt.conf
 # Sign the release.
 declare -r digest_opts=("--digest-algo" "SHA512" "--cert-digest-algo" "SHA512")
 (cd "${release}" && rm -f Release.gpg InRelease)
-(cd "${release}" && gpg --no-default-keyring --keyring "${keyring}" --clearsign "${digest_opts[@]}" -o InRelease Release)
-(cd "${release}" && gpg --no-default-keyring --keyring "${keyring}" -abs "${digest_opts[@]}" -o Release.gpg Release)
+(cd "${release}" && gpg "${gpg_opts[@]}" --clearsign "${digest_opts[@]}" -o InRelease Release)
+(cd "${release}" && gpg "${gpg_opts[@]}" -abs "${digest_opts[@]}" -o Release.gpg Release)
diff --git a/tools/nogo/BUILD b/tools/nogo/BUILD
index e1bfb9a2c..12b8b597c 100644
--- a/tools/nogo/BUILD
+++ b/tools/nogo/BUILD
@@ -1,22 +1,41 @@
-load("//tools:defs.bzl", "bzl_library", "go_library")
+load("//tools:defs.bzl", "bzl_library", "go_library", "select_goarch", "select_goos")
+load("//tools/nogo:defs.bzl", "nogo_objdump_tool", "nogo_stdlib", "nogo_target")
 
 package(licenses = ["notice"])
 
+nogo_target(
+    name = "target",
+    goarch = select_goarch(),
+    goos = select_goos(),
+    visibility = ["//visibility:public"],
+)
+
+nogo_objdump_tool(
+    name = "objdump_tool",
+    visibility = ["//visibility:public"],
+)
+
+nogo_stdlib(
+    name = "stdlib",
+    visibility = ["//visibility:public"],
+)
+
 go_library(
     name = "nogo",
     srcs = [
+        "analyzers.go",
         "build.go",
         "config.go",
-        "matchers.go",
+        "findings.go",
         "nogo.go",
-        "register.go",
     ],
     nogo = False,
     visibility = ["//:sandbox"],
     deps = [
         "//tools/checkescape",
         "//tools/checkunsafe",
-        "//tools/nogo/data",
+        "@co_honnef_go_tools//staticcheck:go_default_library",
+        "@co_honnef_go_tools//stylecheck:go_default_library",
         "@org_golang_x_tools//go/analysis:go_tool_library",
         "@org_golang_x_tools//go/analysis/internal/facts:go_tool_library",
         "@org_golang_x_tools//go/analysis/passes/asmdecl:go_tool_library",
diff --git a/tools/nogo/analyzers.go b/tools/nogo/analyzers.go
new file mode 100644
index 000000000..b919bc2f8
--- /dev/null
+++ b/tools/nogo/analyzers.go
@@ -0,0 +1,131 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nogo
+
+import (
+	"encoding/gob"
+
+	"golang.org/x/tools/go/analysis"
+	"golang.org/x/tools/go/analysis/passes/asmdecl"
+	"golang.org/x/tools/go/analysis/passes/assign"
+	"golang.org/x/tools/go/analysis/passes/atomic"
+	"golang.org/x/tools/go/analysis/passes/bools"
+	"golang.org/x/tools/go/analysis/passes/buildtag"
+	"golang.org/x/tools/go/analysis/passes/cgocall"
+	"golang.org/x/tools/go/analysis/passes/composite"
+	"golang.org/x/tools/go/analysis/passes/copylock"
+	"golang.org/x/tools/go/analysis/passes/errorsas"
+	"golang.org/x/tools/go/analysis/passes/httpresponse"
+	"golang.org/x/tools/go/analysis/passes/loopclosure"
+	"golang.org/x/tools/go/analysis/passes/lostcancel"
+	"golang.org/x/tools/go/analysis/passes/nilfunc"
+	"golang.org/x/tools/go/analysis/passes/nilness"
+	"golang.org/x/tools/go/analysis/passes/printf"
+	"golang.org/x/tools/go/analysis/passes/shadow"
+	"golang.org/x/tools/go/analysis/passes/shift"
+	"golang.org/x/tools/go/analysis/passes/stdmethods"
+	"golang.org/x/tools/go/analysis/passes/stringintconv"
+	"golang.org/x/tools/go/analysis/passes/structtag"
+	"golang.org/x/tools/go/analysis/passes/tests"
+	"golang.org/x/tools/go/analysis/passes/unmarshal"
+	"golang.org/x/tools/go/analysis/passes/unreachable"
+	"golang.org/x/tools/go/analysis/passes/unsafeptr"
+	"golang.org/x/tools/go/analysis/passes/unusedresult"
+	"honnef.co/go/tools/staticcheck"
+	"honnef.co/go/tools/stylecheck"
+
+	"gvisor.dev/gvisor/tools/checkescape"
+	"gvisor.dev/gvisor/tools/checkunsafe"
+)
+
+// AllAnalyzers is a list of all available analyzers.
+var AllAnalyzers = []*analysis.Analyzer{
+	asmdecl.Analyzer,
+	assign.Analyzer,
+	atomic.Analyzer,
+	bools.Analyzer,
+	buildtag.Analyzer,
+	cgocall.Analyzer,
+	composite.Analyzer,
+	copylock.Analyzer,
+	errorsas.Analyzer,
+	httpresponse.Analyzer,
+	loopclosure.Analyzer,
+	lostcancel.Analyzer,
+	nilfunc.Analyzer,
+	nilness.Analyzer,
+	printf.Analyzer,
+	shift.Analyzer,
+	stdmethods.Analyzer,
+	stringintconv.Analyzer,
+	shadow.Analyzer,
+	structtag.Analyzer,
+	tests.Analyzer,
+	unmarshal.Analyzer,
+	unreachable.Analyzer,
+	unsafeptr.Analyzer,
+	unusedresult.Analyzer,
+	checkescape.Analyzer,
+	checkunsafe.Analyzer,
+}
+
+// EscapeAnalyzers is a list of escape-related analyzers.
+var EscapeAnalyzers = []*analysis.Analyzer{
+	checkescape.EscapeAnalyzer,
+}
+
+func register(all []*analysis.Analyzer) {
+	// Register all fact types.
+	//
+	// N.B. This needs to be done recursively, because there may be
+	// analyzers in the Requires list that do not appear explicitly above.
+	registered := make(map[*analysis.Analyzer]struct{})
+	var registerOne func(*analysis.Analyzer)
+	registerOne = func(a *analysis.Analyzer) {
+		if _, ok := registered[a]; ok {
+			return
+		}
+
+		// Register dependencies.
+		for _, da := range a.Requires {
+			registerOne(da)
+		}
+
+		// Register local facts.
+		for _, f := range a.FactTypes {
+			gob.Register(f)
+		}
+
+		registered[a] = struct{}{} // Done.
+	}
+	for _, a := range all {
+		registerOne(a)
+	}
+}
+
+func init() {
+	// Add all staticcheck analyzers.
+	for _, a := range staticcheck.Analyzers {
+		AllAnalyzers = append(AllAnalyzers, a)
+	}
+	// Add all stylecheck analyzers.
+	for _, a := range stylecheck.Analyzers {
+		AllAnalyzers = append(AllAnalyzers, a)
+	}
+
+	// Register lists.
+	register(AllAnalyzers)
+	register(EscapeAnalyzers)
+}
diff --git a/tools/nogo/build.go b/tools/nogo/build.go
index 433d13738..d173cff1f 100644
--- a/tools/nogo/build.go
+++ b/tools/nogo/build.go
@@ -20,21 +20,11 @@ import (
 	"os"
 )
 
-var (
-	// internalPrefix is the internal path prefix. Note that this is not
-	// special, as paths should be passed relative to the repository root
-	// and should not have any special prefix applied.
-	internalPrefix = fmt.Sprintf("^")
-
-	// externalPrefix is external workspace packages.
-	externalPrefix = "^external/"
-)
-
 // findStdPkg needs to find the bundled standard library packages.
-func (i *importer) findStdPkg(path string) (io.ReadCloser, error) {
+func findStdPkg(GOOS, GOARCH, path string) (io.ReadCloser, error) {
 	if path == "C" {
 		// Cgo builds cannot be analyzed. Skip.
 		return nil, ErrSkip
 	}
-	return os.Open(fmt.Sprintf("external/go_sdk/pkg/%s_%s/%s.a", i.GOOS, i.GOARCH, path))
+	return os.Open(fmt.Sprintf("external/go_sdk/pkg/%s_%s/%s.a", GOOS, GOARCH, path))
 }
diff --git a/tools/nogo/check/BUILD b/tools/nogo/check/BUILD
index e2d76cd5c..e18483a18 100644
--- a/tools/nogo/check/BUILD
+++ b/tools/nogo/check/BUILD
@@ -2,11 +2,10 @@ load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
-# Note that the check binary must be public, since an aspect may be applied
-# across lots of different rules in different repositories.
 go_binary(
     name = "check",
     srcs = ["main.go"],
+    nogo = False,
     visibility = ["//visibility:public"],
     deps = ["//tools/nogo"],
 )
diff --git a/tools/nogo/check/main.go b/tools/nogo/check/main.go
index 3828edf3a..69bdfe502 100644
--- a/tools/nogo/check/main.go
+++ b/tools/nogo/check/main.go
@@ -16,9 +16,99 @@
 package main
 
 import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"os"
+
 	"gvisor.dev/gvisor/tools/nogo"
 )
 
+var (
+	packageFile    = flag.String("package", "", "package configuration file (in JSON format)")
+	stdlibFile     = flag.String("stdlib", "", "stdlib configuration file (in JSON format)")
+	findingsOutput = flag.String("findings", "", "output file (or stdout, if not specified)")
+	factsOutput    = flag.String("facts", "", "output file for facts (optional)")
+	escapesOutput  = flag.String("escapes", "", "output file for escapes (optional)")
+)
+
+func loadConfig(file string, config interface{}) interface{} {
+	// Load the configuration.
+	f, err := os.Open(file)
+	if err != nil {
+		log.Fatalf("unable to open configuration %q: %v", file, err)
+	}
+	defer f.Close()
+	dec := json.NewDecoder(f)
+	dec.DisallowUnknownFields()
+	if err := dec.Decode(config); err != nil {
+		log.Fatalf("unable to decode configuration: %v", err)
+	}
+	return config
+}
+
 func main() {
-	nogo.Main()
+	// Parse all flags.
+	flag.Parse()
+
+	var (
+		findings []nogo.Finding
+		factData []byte
+		err      error
+	)
+
+	// Check & load the configuration.
+	if *packageFile != "" && *stdlibFile != "" {
+		log.Fatalf("unable to perform stdlib and package analysis; provide only one!")
+	}
+
+	// Run the configuration.
+	if *stdlibFile != "" {
+		// Perform basic analysis.
+		c := loadConfig(*stdlibFile, new(nogo.StdlibConfig)).(*nogo.StdlibConfig)
+		findings, factData, err = nogo.CheckStdlib(c, nogo.AllAnalyzers)
+
+	} else if *packageFile != "" {
+		// Perform basic analysis.
+		c := loadConfig(*packageFile, new(nogo.PackageConfig)).(*nogo.PackageConfig)
+		findings, factData, err = nogo.CheckPackage(c, nogo.AllAnalyzers, nil)
+
+		// Do we need to do escape analysis?
+		if *escapesOutput != "" {
+			escapes, _, err := nogo.CheckPackage(c, nogo.EscapeAnalyzers, nil)
+			if err != nil {
+				log.Fatalf("error performing escape analysis: %v", err)
+			}
+			if err := nogo.WriteFindingsToFile(escapes, *escapesOutput); err != nil {
+				log.Fatalf("error writing escapes to %q: %v", *escapesOutput, err)
+			}
+		}
+	} else {
+		log.Fatalf("please provide at least one of package or stdlib!")
+	}
+
+	// Check that analysis was successful.
+	if err != nil {
+		log.Fatalf("error performing analysis: %v", err)
+	}
+
+	// Save facts.
+	if *factsOutput != "" {
+		if err := ioutil.WriteFile(*factsOutput, factData, 0644); err != nil {
+			log.Fatalf("error saving findings to %q: %v", *factsOutput, err)
+		}
+	}
+
+	// Write all findings.
+	if *findingsOutput != "" {
+		if err := nogo.WriteFindingsToFile(findings, *findingsOutput); err != nil {
+			log.Fatalf("error writing findings to %q: %v", *findingsOutput, err)
+		}
+	} else {
+		for _, finding := range findings {
+			fmt.Fprintf(os.Stdout, "%s\n", finding.String())
+		}
+	}
 }
diff --git a/tools/nogo/config.go b/tools/nogo/config.go
index 6958fca69..2fea5b3e1 100644
--- a/tools/nogo/config.go
+++ b/tools/nogo/config.go
@@ -15,102 +15,247 @@
 package nogo
 
 import (
-	"golang.org/x/tools/go/analysis"
-	"golang.org/x/tools/go/analysis/passes/asmdecl"
-	"golang.org/x/tools/go/analysis/passes/assign"
-	"golang.org/x/tools/go/analysis/passes/atomic"
-	"golang.org/x/tools/go/analysis/passes/bools"
-	"golang.org/x/tools/go/analysis/passes/buildtag"
-	"golang.org/x/tools/go/analysis/passes/cgocall"
-	"golang.org/x/tools/go/analysis/passes/composite"
-	"golang.org/x/tools/go/analysis/passes/copylock"
-	"golang.org/x/tools/go/analysis/passes/errorsas"
-	"golang.org/x/tools/go/analysis/passes/httpresponse"
-	"golang.org/x/tools/go/analysis/passes/loopclosure"
-	"golang.org/x/tools/go/analysis/passes/lostcancel"
-	"golang.org/x/tools/go/analysis/passes/nilfunc"
-	"golang.org/x/tools/go/analysis/passes/nilness"
-	"golang.org/x/tools/go/analysis/passes/printf"
-	"golang.org/x/tools/go/analysis/passes/shadow"
-	"golang.org/x/tools/go/analysis/passes/shift"
-	"golang.org/x/tools/go/analysis/passes/stdmethods"
-	"golang.org/x/tools/go/analysis/passes/stringintconv"
-	"golang.org/x/tools/go/analysis/passes/structtag"
-	"golang.org/x/tools/go/analysis/passes/tests"
-	"golang.org/x/tools/go/analysis/passes/unmarshal"
-	"golang.org/x/tools/go/analysis/passes/unreachable"
-	"golang.org/x/tools/go/analysis/passes/unsafeptr"
-	"golang.org/x/tools/go/analysis/passes/unusedresult"
-
-	"gvisor.dev/gvisor/tools/checkescape"
-	"gvisor.dev/gvisor/tools/checkunsafe"
+	"fmt"
+	"regexp"
 )
 
-var analyzerConfig = map[*analysis.Analyzer]matcher{
-	// Standard analyzers.
-	asmdecl.Analyzer: alwaysMatches(),
-	assign.Analyzer: externalExcluded(
-		".*gazelle/walk/walk.go", // False positive.
-	),
-	atomic.Analyzer:   alwaysMatches(),
-	bools.Analyzer:    alwaysMatches(),
-	buildtag.Analyzer: alwaysMatches(),
-	cgocall.Analyzer:  alwaysMatches(),
-	composite.Analyzer: and(
-		disableMatches(), // Disabled for now.
-		resultExcluded{
-			"Object_",
-			"Range{",
-		},
-	),
-	copylock.Analyzer:     internalMatches(), // Common external issues (e.g. protos).
-	errorsas.Analyzer:     alwaysMatches(),
-	httpresponse.Analyzer: alwaysMatches(),
-	loopclosure.Analyzer:  alwaysMatches(),
-	lostcancel.Analyzer:   internalMatches(), // Common external issues.
-	nilfunc.Analyzer:      alwaysMatches(),
-	nilness.Analyzer: and(
-		internalMatches(), // Common "tautological checks".
-		internalExcluded(
-			"pkg/sentry/platform/kvm/kvm_test.go", // Intentional.
-			"tools/bigquery/bigquery.go",          // False positive.
-		),
-	),
-	printf.Analyzer:     alwaysMatches(),
-	shift.Analyzer:      alwaysMatches(),
-	stdmethods.Analyzer: internalMatches(), // Common external issues (e.g. methods named "Write").
-	stringintconv.Analyzer: and(
-		internalExcluded(),
-		externalExcluded(
-			".*protobuf/.*.go",              // Bad conversions.
-			".*flate/huffman_bit_writer.go", // Bad conversion.
-		),
-	),
-	shadow.Analyzer:      disableMatches(),  // Disabled for now.
-	structtag.Analyzer:   internalMatches(), // External not subject to rules.
-	tests.Analyzer:       alwaysMatches(),
-	unmarshal.Analyzer:   alwaysMatches(),
-	unreachable.Analyzer: internalMatches(),
-	unsafeptr.Analyzer: and(
-		internalMatches(),
-		internalExcluded(
-			".*_test.go",                                               // Exclude tests.
-			"pkg/flipcall/.*_unsafe.go",                                // Special case.
-			"pkg/gohacks/gohacks_unsafe.go",                            // Special case.
-			"pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go",          // Special case.
-			"pkg/sentry/platform/kvm/bluepill_unsafe.go",               // Special case.
-			"pkg/sentry/platform/kvm/machine_unsafe.go",                // Special case.
-			"pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go", // Special case.
-			"pkg/sentry/platform/safecopy/safecopy_unsafe.go",          // Special case.
-			"pkg/sentry/vfs/mount_unsafe.go",                           // Special case.
-			"pkg/sentry/platform/systrap/stub_unsafe.go",               // Special case.
-			"pkg/sentry/platform/systrap/switchto_google_unsafe.go",    // Special case.
-			"pkg/sentry/platform/systrap/sysmsg_thread_unsafe.go",      // Special case.
-		),
-	),
-	unusedresult.Analyzer: alwaysMatches(),
-
-	// Internal analyzers: external packages not subject.
-	checkescape.Analyzer: internalMatches(),
-	checkunsafe.Analyzer: internalMatches(),
+// GroupName is a named group.
+type GroupName string
+
+// AnalyzerName is a named analyzer.
+type AnalyzerName string
+
+// Group represents a named collection of files.
+type Group struct {
+	// Name is the short name for the group.
+	Name GroupName `yaml:"name"`
+
+	// Regex matches all full paths in the group.
+	Regex string         `yaml:"regex"`
+	regex *regexp.Regexp `yaml:"-"`
+
+	// Default determines the default group behavior.
+	//
+	// If Default is true, all Analyzers are enabled for this
+	// group. Otherwise, Analyzers must be individually enabled
+	// by specifying a (possible empty) ItemConfig for the group
+	// in the AnalyzerConfig.
+	Default bool `yaml:"default"`
+}
+
+func (g *Group) compile() error {
+	r, err := regexp.Compile(g.Regex)
+	if err != nil {
+		return err
+	}
+	g.regex = r
+	return nil
+}
+
+// ItemConfig is an (Analyzer,Group) configuration.
+type ItemConfig struct {
+	// Exclude are analyzer exclusions.
+	//
+	// Exclude is a list of regular expressions. If the corresponding
+	// Analyzer emits a Finding for which Finding.Position.String()
+	// matches a regular expression in Exclude, the finding will not
+	// be reported.
+	Exclude []string         `yaml:"exclude,omitempty"`
+	exclude []*regexp.Regexp `yaml:"-"`
+
+	// Suppress are analyzer suppressions.
+	//
+	// Suppress is a list of regular expressions. If the corresponding
+	// Analyzer emits a Finding for which Finding.Message matches a regular
+	// expression in Suppress, the finding will not be reported.
+	Suppress []string         `yaml:"suppress,omitempty"`
+	suppress []*regexp.Regexp `yaml:"-"`
+}
+
+func compileRegexps(ss []string, rs *[]*regexp.Regexp) error {
+	*rs = make([]*regexp.Regexp, 0, len(ss))
+	for _, s := range ss {
+		r, err := regexp.Compile(s)
+		if err != nil {
+			return err
+		}
+		*rs = append(*rs, r)
+	}
+	return nil
+}
+
+func (i *ItemConfig) compile() error {
+	if i == nil {
+		// This may be nil if nothing is included in the
+		// item configuration. That's fine, there's nothing
+		// to compile and nothing to exclude & suppress.
+		return nil
+	}
+	if err := compileRegexps(i.Exclude, &i.exclude); err != nil {
+		return fmt.Errorf("in exclude: %w", err)
+	}
+	if err := compileRegexps(i.Suppress, &i.suppress); err != nil {
+		return fmt.Errorf("in suppress: %w", err)
+	}
+	return nil
+}
+
+func (i *ItemConfig) merge(other *ItemConfig) {
+	i.Exclude = append(i.Exclude, other.Exclude...)
+	i.Suppress = append(i.Suppress, other.Suppress...)
+}
+
+func (i *ItemConfig) shouldReport(fullPos, msg string) bool {
+	if i == nil {
+		// See above.
+		return true
+	}
+	for _, r := range i.exclude {
+		if r.MatchString(fullPos) {
+			return false
+		}
+	}
+	for _, r := range i.suppress {
+		if r.MatchString(msg) {
+			return false
+		}
+	}
+	return true
+}
+
+// AnalyzerConfig is the configuration for a single analyzers.
+//
+// This map is keyed by individual Group names, to allow for different
+// configurations depending on what Group the file belongs to.
+type AnalyzerConfig map[GroupName]*ItemConfig
+
+func (a AnalyzerConfig) compile() error {
+	for name, gc := range a {
+		if err := gc.compile(); err != nil {
+			return fmt.Errorf("invalid group %q: %v", name, err)
+		}
+	}
+	return nil
+}
+
+func (a AnalyzerConfig) merge(other AnalyzerConfig) {
+	// Merge all the groups.
+	for name, gc := range other {
+		old, ok := a[name]
+		if !ok || old == nil {
+			a[name] = gc // Not configured in a.
+			continue
+		}
+		old.merge(gc)
+	}
+}
+
+func (a AnalyzerConfig) shouldReport(groupConfig *Group, fullPos, msg string) bool {
+	gc, ok := a[groupConfig.Name]
+	if !ok {
+		return groupConfig.Default
+	}
+
+	// Note that if a section appears for a particular group
+	// for a particular analyzer, then it will now be enabled,
+	// and the group default no longer applies.
+	return gc.shouldReport(fullPos, msg)
+}
+
+// Config is a nogo configuration.
+type Config struct {
+	// Prefixes defines a set of regular expressions that
+	// are standard "prefixes", so that files can be grouped
+	// and specific rules applied to individual groups.
+	Groups []Group `yaml:"groups"`
+
+	// Global is the global analyzer config.
+	Global AnalyzerConfig `yaml:"global"`
+
+	// Analyzers are individual analyzer configurations. The
+	// key for each analyzer is the name of the analyzer. The
+	// value is either a boolean (enable/disable), or a map to
+	// the groups above.
+	Analyzers map[AnalyzerName]AnalyzerConfig `yaml:"analyzers"`
+}
+
+// Merge merges two configurations.
+func (c *Config) Merge(other *Config) {
+	// Merge all groups.
+	for _, g := range other.Groups {
+		// Is there a matching group? If yes, we just delete
+		// it. This will preserve the order provided in the
+		// overriding file, even if it differs.
+		for i := 0; i < len(c.Groups); i++ {
+			if g.Name == c.Groups[i].Name {
+				copy(c.Groups[i:], c.Groups[i+1:])
+				c.Groups = c.Groups[:len(c.Groups)-1]
+				break
+			}
+		}
+		c.Groups = append(c.Groups, g)
+	}
+
+	// Merge global configurations.
+	c.Global.merge(other.Global)
+
+	// Merge all analyzer configurations.
+	for name, ac := range other.Analyzers {
+		old, ok := c.Analyzers[name]
+		if !ok {
+			c.Analyzers[name] = ac // No analyzer in original config.
+			continue
+		}
+		old.merge(ac)
+	}
+}
+
+// Compile compiles a configuration to make it useable.
+func (c *Config) Compile() error {
+	for i := 0; i < len(c.Groups); i++ {
+		if err := c.Groups[i].compile(); err != nil {
+			return fmt.Errorf("invalid group %q: %w", c.Groups[i].Name, err)
+		}
+	}
+	if err := c.Global.compile(); err != nil {
+		return fmt.Errorf("invalid global: %w", err)
+	}
+	for name, ac := range c.Analyzers {
+		if err := ac.compile(); err != nil {
+			return fmt.Errorf("invalid analyzer %q: %w", name, err)
+		}
+	}
+	return nil
+}
+
+// ShouldReport returns true iff the finding should match the Config.
+func (c *Config) ShouldReport(finding Finding) bool {
+	fullPos := finding.Position.String()
+
+	// Find the matching group.
+	var groupConfig *Group
+	for i := 0; i < len(c.Groups); i++ {
+		if c.Groups[i].regex.MatchString(fullPos) {
+			groupConfig = &c.Groups[i]
+			break
+		}
+	}
+
+	// If there is no group matching this path, then
+	// we default to accept the finding.
+	if groupConfig == nil {
+		return true
+	}
+
+	// Suppress via global rule?
+	if !c.Global.shouldReport(groupConfig, fullPos, finding.Message) {
+		return false
+	}
+
+	// Try the analyzer config.
+	ac, ok := c.Analyzers[finding.Category]
+	if !ok {
+		return groupConfig.Default
+	}
+	return ac.shouldReport(groupConfig, fullPos, finding.Message)
 }
diff --git a/tools/nogo/data/BUILD b/tools/nogo/data/BUILD
deleted file mode 100644
index b7564cc44..000000000
--- a/tools/nogo/data/BUILD
+++ /dev/null
@@ -1,10 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "data",
-    srcs = ["data.go"],
-    nogo = False,
-    visibility = ["//tools:__subpackages__"],
-)
diff --git a/tools/nogo/defs.bzl b/tools/nogo/defs.bzl
index d399079c5..b3d297308 100644
--- a/tools/nogo/defs.bzl
+++ b/tools/nogo/defs.bzl
@@ -1,6 +1,168 @@
 """Nogo rules."""
 
-load("//tools/bazeldefs:defs.bzl", "go_context", "go_importpath", "go_rule")
+load("//tools/bazeldefs:go.bzl", "go_context", "go_importpath", "go_rule", "go_test_library")
+
+NogoConfigInfo = provider(
+    "information about a nogo configuration",
+    fields = {
+        "srcs": "the collection of configuration files",
+    },
+)
+
+def _nogo_config_impl(ctx):
+    return [NogoConfigInfo(
+        srcs = ctx.files.srcs,
+    )]
+
+nogo_config = rule(
+    implementation = _nogo_config_impl,
+    attrs = {
+        "srcs": attr.label_list(
+            doc = "a list of yaml files (schema defined by tool/nogo/config.go).",
+            allow_files = True,
+        ),
+    },
+)
+
+NogoTargetInfo = provider(
+    "information about the Go target",
+    fields = {
+        "goarch": "the build architecture (GOARCH)",
+        "goos": "the build OS target (GOOS)",
+    },
+)
+
+def _nogo_target_impl(ctx):
+    return [NogoTargetInfo(
+        goarch = ctx.attr.goarch,
+        goos = ctx.attr.goos,
+    )]
+
+nogo_target = go_rule(
+    rule,
+    implementation = _nogo_target_impl,
+    attrs = {
+        "goarch": attr.string(
+            doc = "the Go build architecture (propagated to other rules).",
+            mandatory = True,
+        ),
+        "goos": attr.string(
+            doc = "the Go OS target (propagated to other rules).",
+            mandatory = True,
+        ),
+    },
+)
+
+def _nogo_objdump_tool_impl(ctx):
+    # Construct the magic dump command.
+    #
+    # Note that in some cases, the input is being fed into the tool via stdin.
+    # Unfortunately, the Go objdump tool expects to see a seekable file [1], so
+    # we need the tool to handle this case by creating a temporary file.
+    #
+    # [1] https://github.com/golang/go/issues/41051
+    nogo_target_info = ctx.attr._nogo_target[NogoTargetInfo]
+    go_ctx = go_context(ctx, goos = nogo_target_info.goos, goarch = nogo_target_info.goarch)
+    env_prefix = " ".join(["%s=%s" % (key, value) for (key, value) in go_ctx.env.items()])
+    dumper = ctx.actions.declare_file(ctx.label.name)
+    ctx.actions.write(dumper, "\n".join([
+        "#!/bin/bash",
+        "set -euo pipefail",
+        "if [[ $# -eq 0 ]]; then",
+        " T=$(mktemp -u -t libXXXXXX.a)",
+        " cat /dev/stdin > ${T}",
+        "else",
+        " T=$1;",
+        "fi",
+        "%s %s tool objdump ${T}" % (
+            env_prefix,
+            go_ctx.go.path,
+        ),
+        "if [[ $# -eq 0 ]]; then",
+        " rm -rf ${T}",
+        "fi",
+        "",
+    ]), is_executable = True)
+
+    # Include the full runfiles.
+    return [DefaultInfo(
+        runfiles = ctx.runfiles(files = go_ctx.runfiles.to_list()),
+        executable = dumper,
+    )]
+
+nogo_objdump_tool = go_rule(
+    rule,
+    implementation = _nogo_objdump_tool_impl,
+    attrs = {
+        "_nogo_target": attr.label(
+            default = "//tools/nogo:target",
+            cfg = "target",
+        ),
+    },
+)
+
+# NogoStdlibInfo is the set of standard library facts.
+NogoStdlibInfo = provider(
+    "information for nogo analysis (standard library facts)",
+    fields = {
+        "facts": "serialized standard library facts",
+        "raw_findings": "raw package findings (if relevant)",
+    },
+)
+
+def _nogo_stdlib_impl(ctx):
+    # Build the standard library facts.
+    nogo_target_info = ctx.attr._nogo_target[NogoTargetInfo]
+    go_ctx = go_context(ctx, goos = nogo_target_info.goos, goarch = nogo_target_info.goarch)
+    facts = ctx.actions.declare_file(ctx.label.name + ".facts")
+    raw_findings = ctx.actions.declare_file(ctx.label.name + ".raw_findings")
+    config = struct(
+        Srcs = [f.path for f in go_ctx.stdlib_srcs],
+        GOOS = go_ctx.goos,
+        GOARCH = go_ctx.goarch,
+        Tags = go_ctx.tags,
+    )
+    config_file = ctx.actions.declare_file(ctx.label.name + ".cfg")
+    ctx.actions.write(config_file, config.to_json())
+    ctx.actions.run(
+        inputs = [config_file] + go_ctx.stdlib_srcs,
+        outputs = [facts, raw_findings],
+        tools = depset(go_ctx.runfiles.to_list() + ctx.files._nogo_objdump_tool),
+        executable = ctx.files._nogo_check[0],
+        mnemonic = "NogoStandardLibraryAnalysis",
+        progress_message = "Analyzing Go Standard Library",
+        arguments = go_ctx.nogo_args + [
+            "-objdump_tool=%s" % ctx.files._nogo_objdump_tool[0].path,
+            "-stdlib=%s" % config_file.path,
+            "-findings=%s" % raw_findings.path,
+            "-facts=%s" % facts.path,
+        ],
+    )
+
+    # Return the stdlib facts as output.
+    return [NogoStdlibInfo(
+        facts = facts,
+        raw_findings = raw_findings,
+    )]
+
+nogo_stdlib = go_rule(
+    rule,
+    implementation = _nogo_stdlib_impl,
+    attrs = {
+        "_nogo_check": attr.label(
+            default = "//tools/nogo/check:check",
+            cfg = "host",
+        ),
+        "_nogo_objdump_tool": attr.label(
+            default = "//tools/nogo:objdump_tool",
+            cfg = "host",
+        ),
+        "_nogo_target": attr.label(
+            default = "//tools/nogo:target",
+            cfg = "target",
+        ),
+    },
+)
 
 # NogoInfo is the serialized set of package facts for a nogo analysis.
 #
@@ -8,10 +170,15 @@ load("//tools/bazeldefs:defs.bzl", "go_context", "go_importpath", "go_rule")
 # with the source files as input. Note however, that the individual nogo rules
 # are simply stubs that enter into the shadow dependency tree (the "aspect").
 NogoInfo = provider(
+    "information for nogo analysis",
     fields = {
         "facts": "serialized package facts",
+        "raw_findings": "raw package findings (if relevant)",
+        "escapes": "escape-only findings (if relevant)",
         "importpath": "package import path",
         "binaries": "package binary files",
+        "srcs": "srcs (for go_test support)",
+        "deps": "deps (for go_test support)",
     },
 )
 
@@ -21,17 +188,26 @@ def _nogo_aspect_impl(target, ctx):
     # All work is done in the shadow properties for go rules. For a proto
     # library, we simply skip the analysis portion but still need to return a
     # valid NogoInfo to reference the generated binary.
-    if ctx.rule.kind == "go_library":
+    if ctx.rule.kind in ("go_library", "go_tool_library", "go_binary", "go_test"):
         srcs = ctx.rule.files.srcs
-    elif ctx.rule.kind == "go_proto_library" or ctx.rule.kind == "go_wrap_cc":
+        deps = ctx.rule.attr.deps
+    elif ctx.rule.kind in ("go_proto_library", "go_wrap_cc"):
         srcs = []
+        deps = ctx.rule.attr.deps
     else:
         return [NogoInfo()]
 
-    go_ctx = go_context(ctx)
-
-    # Construct the Go environment from the go_ctx.env dictionary.
-    env_prefix = " ".join(["%s=%s" % (key, value) for (key, value) in go_ctx.env.items()])
+    # If we're using the "library" attribute, then we need to aggregate the
+    # original library sources and dependencies into this target to perform
+    # proper type analysis.
+    if ctx.rule.kind == "go_test":
+        library = go_test_library(ctx.rule)
+        if library != None:
+            info = library[NogoInfo]
+            if hasattr(info, "srcs"):
+                srcs = srcs + info.srcs
+            if hasattr(info, "deps"):
+                deps = deps + info.deps
 
     # Start with all target files and srcs as input.
     inputs = target.files.to_list() + srcs
@@ -41,50 +217,31 @@ def _nogo_aspect_impl(target, ctx):
     # to cleanly allow us redirect stdout to the actual output file. Perhaps
     # I'm missing something here, but the intermediate script does work.
     binaries = target.files.to_list()
-    disasm_file = ctx.actions.declare_file(target.label.name + ".out")
-    dumper = ctx.actions.declare_file("%s-dumper" % ctx.label.name)
-    ctx.actions.write(dumper, "\n".join([
-        "#!/bin/bash",
-        "%s %s tool objdump %s > %s\n" % (
-            env_prefix,
-            go_ctx.go.path,
-            [f.path for f in binaries if f.path.endswith(".a")][0],
-            disasm_file.path,
-        ),
-    ]), is_executable = True)
-    ctx.actions.run(
-        inputs = binaries,
-        outputs = [disasm_file],
-        tools = go_ctx.runfiles,
-        mnemonic = "GoObjdump",
-        progress_message = "Objdump %s" % target.label,
-        executable = dumper,
-    )
-    inputs.append(disasm_file)
+    objfiles = [f for f in binaries if f.path.endswith(".a")]
+    if len(objfiles) > 0:
+        # Prefer the .a files for go_library targets.
+        target_objfile = objfiles[0]
+    else:
+        # Use the raw binary for go_binary and go_test targets.
+        target_objfile = binaries[0]
+    inputs.append(target_objfile)
 
     # Extract the importpath for this package.
-    importpath = go_importpath(target)
-
-    # The nogo tool requires a configfile serialized in JSON format to do its
-    # work. This must line up with the nogo.Config fields.
-    facts = ctx.actions.declare_file(target.label.name + ".facts")
-    config = struct(
-        ImportPath = importpath,
-        GoFiles = [src.path for src in srcs if src.path.endswith(".go")],
-        NonGoFiles = [src.path for src in srcs if not src.path.endswith(".go")],
-        # Google's internal build system needs a bit more help to find std.
-        StdZip = go_ctx.std_zip.short_path if hasattr(go_ctx, "std_zip") else "",
-        GOOS = go_ctx.goos,
-        GOARCH = go_ctx.goarch,
-        Tags = go_ctx.tags,
-        FactMap = {},  # Constructed below.
-        ImportMap = {},  # Constructed below.
-        FactOutput = facts.path,
-        Objdump = disasm_file.path,
-    )
+    if ctx.rule.kind == "go_test":
+        # If this is a test, then it will not be imported by anything else.
+        # We can safely set the importapth to just "test". Note that this
+        # is necessary if the library also imports the core library (in
+        # addition to including the sources directly), which happens in
+        # some complex cases (seccomp_victim).
+        importpath = "test"
+    else:
+        importpath = go_importpath(target)
 
     # Collect all info from shadow dependencies.
-    for dep in ctx.rule.attr.deps:
+    fact_map = dict()
+    import_map = dict()
+    all_raw_findings = []
+    for dep in deps:
         # There will be no file attribute set for all transitive dependencies
         # that are not go_library or go_binary rules, such as a proto rules.
         # This is handled by the ctx.rule.kind check above.
@@ -98,44 +255,101 @@ def _nogo_aspect_impl(target, ctx):
         x_files = [f.path for f in info.binaries if f.path.endswith(".x")]
         if not len(x_files):
             x_files = [f.path for f in info.binaries if f.path.endswith(".a")]
-        config.ImportMap[info.importpath] = x_files[0]
-        config.FactMap[info.importpath] = info.facts.path
+        import_map[info.importpath] = x_files[0]
+        fact_map[info.importpath] = info.facts.path
+
+        # Collect all findings; duplicates are resolved at the end.
+        all_raw_findings.extend(info.raw_findings)
 
         # Ensure the above are available as inputs.
         inputs.append(info.facts)
         inputs += info.binaries
 
-    # Write the configuration and run the tool.
+    # Add the standard library facts.
+    stdlib_info = ctx.attr._nogo_stdlib[NogoStdlibInfo]
+    stdlib_facts = stdlib_info.facts
+    inputs.append(stdlib_facts)
+
+    # The nogo tool operates on a configuration serialized in JSON format.
+    nogo_target_info = ctx.attr._nogo_target[NogoTargetInfo]
+    go_ctx = go_context(ctx, goos = nogo_target_info.goos, goarch = nogo_target_info.goarch)
+    facts = ctx.actions.declare_file(target.label.name + ".facts")
+    raw_findings = ctx.actions.declare_file(target.label.name + ".raw_findings")
+    escapes = ctx.actions.declare_file(target.label.name + ".escapes")
+    config = struct(
+        ImportPath = importpath,
+        GoFiles = [src.path for src in srcs if src.path.endswith(".go")],
+        NonGoFiles = [src.path for src in srcs if not src.path.endswith(".go")],
+        GOOS = go_ctx.goos,
+        GOARCH = go_ctx.goarch,
+        Tags = go_ctx.tags,
+        FactMap = fact_map,
+        ImportMap = import_map,
+        StdlibFacts = stdlib_facts.path,
+    )
     config_file = ctx.actions.declare_file(target.label.name + ".cfg")
     ctx.actions.write(config_file, config.to_json())
     inputs.append(config_file)
-
-    # Run the nogo tool itself.
     ctx.actions.run(
         inputs = inputs,
-        outputs = [facts],
-        tools = go_ctx.runfiles,
-        executable = ctx.files._nogo[0],
-        mnemonic = "GoStaticAnalysis",
+        outputs = [facts, raw_findings, escapes],
+        tools = depset(go_ctx.runfiles.to_list() + ctx.files._nogo_objdump_tool),
+        executable = ctx.files._nogo_check[0],
+        mnemonic = "NogoAnalysis",
         progress_message = "Analyzing %s" % target.label,
-        arguments = ["-config=%s" % config_file.path],
+        arguments = go_ctx.nogo_args + [
+            "-binary=%s" % target_objfile.path,
+            "-objdump_tool=%s" % ctx.files._nogo_objdump_tool[0].path,
+            "-package=%s" % config_file.path,
+            "-findings=%s" % raw_findings.path,
+            "-facts=%s" % facts.path,
+            "-escapes=%s" % escapes.path,
+        ],
     )
 
+    # Flatten all findings from all dependencies.
+    #
+    # This is done because all the filtering must be done at the
+    # top-level nogo_test to dynamically apply a configuration.
+    # This does not actually add any additional work here, but
+    # will simply propagate the full list of files.
+    all_raw_findings = [stdlib_info.raw_findings] + depset(all_raw_findings).to_list() + [raw_findings]
+
     # Return the package facts as output.
     return [NogoInfo(
         facts = facts,
+        raw_findings = all_raw_findings,
+        escapes = escapes,
         importpath = importpath,
         binaries = binaries,
+        srcs = srcs,
+        deps = deps,
     )]
 
 nogo_aspect = go_rule(
     aspect,
     implementation = _nogo_aspect_impl,
-    attr_aspects = ["deps"],
+    attr_aspects = [
+        "deps",
+        "library",
+        "embed",
+    ],
     attrs = {
-        "_nogo": attr.label(
+        "_nogo_check": attr.label(
             default = "//tools/nogo/check:check",
-            allow_single_file = True,
+            cfg = "host",
+        ),
+        "_nogo_stdlib": attr.label(
+            default = "//tools/nogo:stdlib",
+            cfg = "host",
+        ),
+        "_nogo_objdump_tool": attr.label(
+            default = "//tools/nogo:objdump_tool",
+            cfg = "host",
+        ),
+        "_nogo_target": attr.label(
+            default = "//tools/nogo:target",
+            cfg = "target",
         ),
     },
 )
@@ -143,34 +357,72 @@ nogo_aspect = go_rule(
 def _nogo_test_impl(ctx):
     """Check nogo findings."""
 
-    # Build a runner that checks for the existence of the facts file. Note that
-    # the actual build will fail in the case of a broken analysis. We things
-    # this way so that any test applied is effectively pushed down to all
-    # upstream dependencies through the aspect.
-    inputs = []
-    runner = ctx.actions.declare_file("%s-executer" % ctx.label.name)
-    runner_content = ["#!/bin/bash"]
-    for dep in ctx.attr.deps:
-        info = dep[NogoInfo]
-        inputs.append(info.facts)
+    # Ensure there's a single dependency.
+    if len(ctx.attr.deps) != 1:
+        fail("nogo_test requires exactly one dep.")
+    raw_findings = ctx.attr.deps[0][NogoInfo].raw_findings
+    escapes = ctx.attr.deps[0][NogoInfo].escapes
+
+    # Build a step that applies the configuration.
+    config_srcs = ctx.attr.config[NogoConfigInfo].srcs
+    findings = ctx.actions.declare_file(ctx.label.name + ".findings")
+    ctx.actions.run(
+        inputs = raw_findings + ctx.files.srcs + config_srcs,
+        outputs = [findings],
+        tools = depset(ctx.files._filter),
+        executable = ctx.files._filter[0],
+        mnemonic = "GoStaticAnalysis",
+        progress_message = "Generating %s" % ctx.label,
+        arguments = ["-input=%s" % f.path for f in raw_findings] +
+                    ["-config=%s" % f.path for f in config_srcs] +
+                    ["-output=%s" % findings.path],
+    )
 
-        # Draw a sweet unicode checkmark with the package name (in green).
-        runner_content.append("echo -e \"\\033[0;32m\\xE2\\x9C\\x94\\033[0;31m\\033[0m %s\"" % info.importpath)
-    runner_content.append("exit 0\n")
+    # Build a runner that checks the filtered facts.
+    #
+    # Note that this calls the filter binary without any configuration, so all
+    # findings will be included. But this is expected, since we've already
+    # filtered out everything that should not be included.
+    runner = ctx.actions.declare_file(ctx.label.name)
+    runner_content = [
+        "#!/bin/bash",
+        "exec %s -input=%s" % (ctx.files._filter[0].short_path, findings.short_path),
+        "",
+    ]
     ctx.actions.write(runner, "\n".join(runner_content), is_executable = True)
+
     return [DefaultInfo(
-        runfiles = ctx.runfiles(files = inputs),
+        # The runner just executes the filter again, on the
+        # newly generated filtered findings. We still need
+        # the filter tool as part of our runfiles, however.
+        runfiles = ctx.runfiles(files = ctx.files._filter + [findings]),
         executable = runner,
+    ), OutputGroupInfo(
+        # Propagate the filtered filters, for consumption by
+        # build tooling. Note that the build tooling typically
+        # pays attention to the mnemoic above, so this must be
+        # what is expected by the tooling.
+        nogo_findings = depset([findings]),
+        # Expose all escape analysis findings (see above).
+        nogo_escapes = depset([escapes]),
     )]
 
-_nogo_test = rule(
+nogo_test = rule(
     implementation = _nogo_test_impl,
     attrs = {
-        "deps": attr.label_list(aspects = [nogo_aspect]),
+        "config": attr.label(
+            mandatory = True,
+            doc = "A rule of kind nogo_config.",
+        ),
+        "deps": attr.label_list(
+            aspects = [nogo_aspect],
+            doc = "Exactly one Go dependency to be analyzed.",
+        ),
+        "srcs": attr.label_list(
+            allow_files = True,
+            doc = "Relevant src files. This is ignored except to make the nogo_test directly affected by the files.",
+        ),
+        "_filter": attr.label(default = "//tools/nogo/filter:filter"),
     },
     test = True,
 )
-
-def nogo_test(**kwargs):
-    tags = kwargs.pop("tags", []) + ["nogo"]
-    _nogo_test(tags = tags, **kwargs)
diff --git a/tools/nogo/filter/BUILD b/tools/nogo/filter/BUILD
new file mode 100644
index 000000000..e56a783e2
--- /dev/null
+++ b/tools/nogo/filter/BUILD
@@ -0,0 +1,14 @@
+load("//tools:defs.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "filter",
+    srcs = ["main.go"],
+    nogo = False,
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tools/nogo",
+        "@in_gopkg_yaml_v2//:go_default_library",
+    ],
+)
diff --git a/tools/nogo/filter/main.go b/tools/nogo/filter/main.go
new file mode 100644
index 000000000..9cf41b3b0
--- /dev/null
+++ b/tools/nogo/filter/main.go
@@ -0,0 +1,131 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary check is the nogo entrypoint.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"os"
+	"strings"
+
+	yaml "gopkg.in/yaml.v2"
+	"gvisor.dev/gvisor/tools/nogo"
+)
+
+type stringList []string
+
+func (s *stringList) String() string {
+	return strings.Join(*s, ",")
+}
+
+func (s *stringList) Set(value string) error {
+	*s = append(*s, value)
+	return nil
+}
+
+var (
+	inputFiles  stringList
+	configFiles stringList
+	outputFile  string
+	showConfig  bool
+)
+
+func init() {
+	flag.Var(&inputFiles, "input", "findings input files")
+	flag.StringVar(&outputFile, "output", "", "findings output file")
+	flag.Var(&configFiles, "config", "findings configuration files")
+	flag.BoolVar(&showConfig, "show-config", false, "dump configuration only")
+}
+
+func main() {
+	flag.Parse()
+
+	// Load all available findings.
+	var findings []nogo.Finding
+	for _, filename := range inputFiles {
+		inputFindings, err := nogo.ExtractFindingsFromFile(filename)
+		if err != nil {
+			log.Fatalf("unable to extract findings from %s: %v", filename, err)
+		}
+		findings = append(findings, inputFindings...)
+	}
+
+	// Open and merge all configuations.
+	config := &nogo.Config{
+		Global:    make(nogo.AnalyzerConfig),
+		Analyzers: make(map[nogo.AnalyzerName]nogo.AnalyzerConfig),
+	}
+	for _, filename := range configFiles {
+		content, err := ioutil.ReadFile(filename)
+		if err != nil {
+			log.Fatalf("unable to read %s: %v", filename, err)
+		}
+		var newConfig nogo.Config // For current file.
+		if err := yaml.Unmarshal(content, &newConfig); err != nil {
+			log.Fatalf("unable to decode %s: %v", filename, err)
+		}
+		config.Merge(&newConfig)
+		if showConfig {
+			bytes, err := yaml.Marshal(&newConfig)
+			if err != nil {
+				log.Fatalf("error marshalling config: %v", err)
+			}
+			mergedBytes, err := yaml.Marshal(config)
+			if err != nil {
+				log.Fatalf("error marshalling config: %v", err)
+			}
+			fmt.Fprintf(os.Stdout, "Loaded configuration from %s:\n%s\n", filename, string(bytes))
+			fmt.Fprintf(os.Stdout, "Merged configuration:\n%s\n", string(mergedBytes))
+		}
+	}
+	if err := config.Compile(); err != nil {
+		log.Fatalf("error compiling config: %v", err)
+	}
+	if showConfig {
+		os.Exit(0)
+	}
+
+	// Filter the findings (and aggregate by group).
+	filteredFindings := make([]nogo.Finding, 0, len(findings))
+	for _, finding := range findings {
+		if ok := config.ShouldReport(finding); ok {
+			filteredFindings = append(filteredFindings, finding)
+		}
+	}
+
+	// Write the output (if required).
+	//
+	// If the outputFile is specified, then we exit here. Otherwise,
+	// we continue to write to stdout and treat like a test.
+	if outputFile != "" {
+		if err := nogo.WriteFindingsToFile(filteredFindings, outputFile); err != nil {
+			log.Fatalf("unable to write findings: %v", err)
+		}
+		return
+	}
+
+	// Treat the run as a test.
+	if len(filteredFindings) == 0 {
+		fmt.Fprintf(os.Stdout, "PASS\n")
+		os.Exit(0)
+	}
+	for _, finding := range filteredFindings {
+		fmt.Fprintf(os.Stdout, "%s\n", finding.String())
+	}
+	os.Exit(1)
+}
diff --git a/tools/nogo/findings.go b/tools/nogo/findings.go
new file mode 100644
index 000000000..5bd850269
--- /dev/null
+++ b/tools/nogo/findings.go
@@ -0,0 +1,63 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nogo
+
+import (
+	"encoding/json"
+	"fmt"
+	"go/token"
+	"io/ioutil"
+)
+
+// Finding is a single finding.
+type Finding struct {
+	Category AnalyzerName
+	Position token.Position
+	Message  string
+}
+
+// String implements fmt.Stringer.String.
+func (f *Finding) String() string {
+	return fmt.Sprintf("%s: %s: %s", f.Category, f.Position.String(), f.Message)
+}
+
+// WriteFindingsToFile writes findings to a file.
+func WriteFindingsToFile(findings []Finding, filename string) error {
+	content, err := WriteFindingsToBytes(findings)
+	if err != nil {
+		return err
+	}
+	return ioutil.WriteFile(filename, content, 0644)
+}
+
+// WriteFindingsToBytes serializes findings as bytes.
+func WriteFindingsToBytes(findings []Finding) ([]byte, error) {
+	return json.Marshal(findings)
+}
+
+// ExtractFindingsFromFile loads findings from a file.
+func ExtractFindingsFromFile(filename string) ([]Finding, error) {
+	content, err := ioutil.ReadFile(filename)
+	if err != nil {
+		return nil, err
+	}
+	return ExtractFindingsFromBytes(content)
+}
+
+// ExtractFindingsFromBytes loads findings from bytes.
+func ExtractFindingsFromBytes(content []byte) (findings []Finding, err error) {
+	err = json.Unmarshal(content, &findings)
+	return findings, err
+}
diff --git a/tools/nogo/io_bazel_rules_go-visibility.patch b/tools/nogo/io_bazel_rules_go-visibility.patch
deleted file mode 100644
index 6b64b2e85..000000000
--- a/tools/nogo/io_bazel_rules_go-visibility.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-diff --git a/third_party/org_golang_x_tools-extras.patch b/third_party/org_golang_x_tools-extras.patch
-index 133fbccc..5f0d9a47 100644
---- a/third_party/org_golang_x_tools-extras.patch
-+++ b/third_party/org_golang_x_tools-extras.patch
-@@ -32,7 +32,7 @@ diff -urN c/go/analysis/internal/facts/BUILD.bazel d/go/analysis/internal/facts/
-  
-  go_library(
-      name = "go_default_library",
--@@ -14,6 +14,23 @@
-+@@ -14,6 +14,20 @@
-      ],
-  )
-  
-@@ -43,10 +43,7 @@ diff -urN c/go/analysis/internal/facts/BUILD.bazel d/go/analysis/internal/facts/
- +        "imports.go",
- +    ],
- +    importpath = "golang.org/x/tools/go/analysis/internal/facts",
--+    visibility = [
--+        "//go/analysis:__subpackages__",
--+        "@io_bazel_rules_go//go/tools/builders:__pkg__",
--+    ],
-++    visibility = ["//visibility:public"],
- +    deps = [
- +        "//go/analysis:go_tool_library",
- +        "//go/types/objectpath:go_tool_library",
diff --git a/tools/nogo/matchers.go b/tools/nogo/matchers.go
deleted file mode 100644
index 57a250501..000000000
--- a/tools/nogo/matchers.go
+++ /dev/null
@@ -1,143 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package nogo
-
-import (
-	"go/token"
-	"path/filepath"
-	"regexp"
-	"strings"
-
-	"golang.org/x/tools/go/analysis"
-)
-
-type matcher interface {
-	ShouldReport(d analysis.Diagnostic, fs *token.FileSet) bool
-}
-
-// pathRegexps filters explicit paths.
-type pathRegexps struct {
-	expr []*regexp.Regexp
-
-	// include, if true, indicates that paths matching any regexp in expr
-	// match.
-	//
-	// If false, paths matching no regexps in expr match.
-	include bool
-}
-
-// buildRegexps builds a list of regular expressions.
-//
-// This will panic on error.
-func buildRegexps(prefix string, args ...string) []*regexp.Regexp {
-	result := make([]*regexp.Regexp, 0, len(args))
-	for _, arg := range args {
-		result = append(result, regexp.MustCompile(filepath.Join(prefix, arg)))
-	}
-	return result
-}
-
-// ShouldReport implements matcher.ShouldReport.
-func (p *pathRegexps) ShouldReport(d analysis.Diagnostic, fs *token.FileSet) bool {
-	fullPos := fs.Position(d.Pos).String()
-	for _, path := range p.expr {
-		if path.MatchString(fullPos) {
-			return p.include
-		}
-	}
-	return !p.include
-}
-
-// internalExcluded excludes specific internal paths.
-func internalExcluded(paths ...string) *pathRegexps {
-	return &pathRegexps{
-		expr:    buildRegexps(internalPrefix, paths...),
-		include: false,
-	}
-}
-
-// excludedExcluded excludes specific external paths.
-func externalExcluded(paths ...string) *pathRegexps {
-	return &pathRegexps{
-		expr:    buildRegexps(externalPrefix, paths...),
-		include: false,
-	}
-}
-
-// internalMatches returns a path matcher for internal packages.
-func internalMatches() *pathRegexps {
-	return &pathRegexps{
-		expr:    buildRegexps(internalPrefix, ".*"),
-		include: true,
-	}
-}
-
-// resultExcluded excludes explicit message contents.
-type resultExcluded []string
-
-// ShouldReport implements matcher.ShouldReport.
-func (r resultExcluded) ShouldReport(d analysis.Diagnostic, _ *token.FileSet) bool {
-	for _, str := range r {
-		if strings.Contains(d.Message, str) {
-			return false
-		}
-	}
-	return true // Not excluded.
-}
-
-// andMatcher is a composite matcher.
-type andMatcher struct {
-	first  matcher
-	second matcher
-}
-
-// ShouldReport implements matcher.ShouldReport.
-func (a *andMatcher) ShouldReport(d analysis.Diagnostic, fs *token.FileSet) bool {
-	return a.first.ShouldReport(d, fs) && a.second.ShouldReport(d, fs)
-}
-
-// and is a syntactic convension for andMatcher.
-func and(first matcher, second matcher) *andMatcher {
-	return &andMatcher{
-		first:  first,
-		second: second,
-	}
-}
-
-// anyMatcher matches everything.
-type anyMatcher struct{}
-
-// ShouldReport implements matcher.ShouldReport.
-func (anyMatcher) ShouldReport(analysis.Diagnostic, *token.FileSet) bool {
-	return true
-}
-
-// alwaysMatches returns an anyMatcher instance.
-func alwaysMatches() anyMatcher {
-	return anyMatcher{}
-}
-
-// neverMatcher will never match.
-type neverMatcher struct{}
-
-// ShouldReport implements matcher.ShouldReport.
-func (neverMatcher) ShouldReport(analysis.Diagnostic, *token.FileSet) bool {
-	return false
-}
-
-// disableMatches returns a neverMatcher instance.
-func disableMatches() neverMatcher {
-	return neverMatcher{}
-}
diff --git a/tools/nogo/nogo.go b/tools/nogo/nogo.go
index ea1e97076..779d4d6d8 100644
--- a/tools/nogo/nogo.go
+++ b/tools/nogo/nogo.go
@@ -21,7 +21,6 @@ package nogo
 import (
 	"encoding/json"
 	"errors"
-	"flag"
 	"fmt"
 	"go/ast"
 	"go/build"
@@ -32,51 +31,89 @@ import (
 	"io/ioutil"
 	"log"
 	"os"
+	"path"
 	"path/filepath"
 	"reflect"
+	"strings"
 
 	"golang.org/x/tools/go/analysis"
 	"golang.org/x/tools/go/analysis/internal/facts"
 	"golang.org/x/tools/go/gcexportdata"
-	"gvisor.dev/gvisor/tools/nogo/data"
+
+	// Special case: flags live here and change overall behavior.
+	"gvisor.dev/gvisor/tools/checkescape"
 )
 
-// pkgConfig is serialized as the configuration.
+// StdlibConfig is serialized as the configuration.
 //
-// This contains everything required for the analysis.
-type pkgConfig struct {
-	ImportPath string
-	GoFiles    []string
-	NonGoFiles []string
-	Tags       []string
-	GOOS       string
-	GOARCH     string
-	ImportMap  map[string]string
-	FactMap    map[string]string
-	FactOutput string
-	Objdump    string
-	StdZip     string
+// This contains everything required for stdlib analysis.
+type StdlibConfig struct {
+	Srcs   []string
+	GOOS   string
+	GOARCH string
+	Tags   []string
 }
 
-// loadFacts finds and loads facts per FactMap.
-func (c *pkgConfig) loadFacts(path string) ([]byte, error) {
-	realPath, ok := c.FactMap[path]
-	if !ok {
-		return nil, nil // No facts available.
-	}
+// PackageConfig is serialized as the configuration.
+//
+// This contains everything required for single package analysis.
+type PackageConfig struct {
+	ImportPath  string
+	GoFiles     []string
+	NonGoFiles  []string
+	Tags        []string
+	GOOS        string
+	GOARCH      string
+	ImportMap   map[string]string
+	FactMap     map[string]string
+	StdlibFacts string
+}
 
-	// Read the files file.
-	data, err := ioutil.ReadFile(realPath)
-	if err != nil {
-		return nil, err
+// loader is a fact-loader function.
+type loader func(string) ([]byte, error)
+
+// saver is a fact-saver function.
+type saver func([]byte) error
+
+// factLoader returns a function that loads facts.
+//
+// This resolves all standard library facts and imported package facts up
+// front. The returned loader function will never return an error, only
+// empty facts.
+//
+// This is done because all stdlib data is stored together, and we don't want
+// to load this data many times over.
+func (c *PackageConfig) factLoader() (loader, error) {
+	allFacts := make(map[string][]byte)
+	if c.StdlibFacts != "" {
+		data, err := ioutil.ReadFile(c.StdlibFacts)
+		if err != nil {
+			return nil, fmt.Errorf("error loading stdlib facts from %q: %w", c.StdlibFacts, err)
+		}
+		var stdlibFacts map[string][]byte
+		if err := json.Unmarshal(data, &stdlibFacts); err != nil {
+			return nil, fmt.Errorf("error loading stdlib facts: %w", err)
+		}
+		for pkg, data := range stdlibFacts {
+			allFacts[pkg] = data
+		}
 	}
-	return data, nil
+	for pkg, file := range c.FactMap {
+		data, err := ioutil.ReadFile(file)
+		if err != nil {
+			return nil, fmt.Errorf("error loading %q: %w", file, err)
+		}
+		allFacts[pkg] = data
+	}
+	return func(path string) ([]byte, error) {
+		return allFacts[path], nil
+	}, nil
 }
 
 // shouldInclude indicates whether the file should be included.
 //
 // NOTE: This does only basic parsing of tags.
-func (c *pkgConfig) shouldInclude(path string) (bool, error) {
+func (c *PackageConfig) shouldInclude(path string) (bool, error) {
 	ctx := build.Default
 	ctx.GOOS = c.GOOS
 	ctx.GOARCH = c.GOARCH
@@ -90,10 +127,11 @@ func (c *pkgConfig) shouldInclude(path string) (bool, error) {
 // files, and the facts. Note that this importer implementation will always
 // pass when a given package is not available.
 type importer struct {
-	pkgConfig
-	fset    *token.FileSet
-	cache   map[string]*types.Package
-	lastErr error
+	*PackageConfig
+	fset     *token.FileSet
+	cache    map[string]*types.Package
+	lastErr  error
+	callback func(string) error
 }
 
 // Import implements types.Importer.Import.
@@ -104,6 +142,17 @@ func (i *importer) Import(path string) (*types.Package, error) {
 		// analyzers are specifically looking for this.
 		return types.Unsafe, nil
 	}
+
+	// Call the internal callback. This is used to resolve loading order
+	// for the standard library. See checkStdlib.
+	if i.callback != nil {
+		if err := i.callback(path); err != nil {
+			i.lastErr = err
+			return nil, err
+		}
+	}
+
+	// Actually load the data.
 	realPath, ok := i.ImportMap[path]
 	var (
 		rc  io.ReadCloser
@@ -112,7 +161,7 @@ func (i *importer) Import(path string) (*types.Package, error) {
 	if !ok {
 		// Not found in the import path. Attempt to find the package
 		// via the standard library.
-		rc, err = i.findStdPkg(path)
+		rc, err = findStdPkg(i.GOOS, i.GOARCH, path)
 	} else {
 		// Open the file.
 		rc, err = os.Open(realPath)
@@ -135,7 +184,165 @@ func (i *importer) Import(path string) (*types.Package, error) {
 // ErrSkip indicates the package should be skipped.
 var ErrSkip = errors.New("skipped")
 
-// checkPackage runs all analyzers.
+// CheckStdlib checks the standard library.
+//
+// This constructs a synthetic package configuration for each library in the
+// standard library sources, and call CheckPackage repeatedly.
+//
+// Note that not all parts of the source are expected to build. We skip obvious
+// test files, and cmd files, which should not be dependencies.
+func CheckStdlib(config *StdlibConfig, analyzers []*analysis.Analyzer) (allFindings []Finding, facts []byte, err error) {
+	if len(config.Srcs) == 0 {
+		return nil, nil, nil
+	}
+
+	// Ensure all paths are normalized.
+	for i := 0; i < len(config.Srcs); i++ {
+		config.Srcs[i] = path.Clean(config.Srcs[i])
+	}
+
+	// Calculate the root source directory. This is always a directory
+	// named 'src', of which we simply take the first we find. This is a
+	// bit fragile, but works for all currently known Go source
+	// configurations.
+	//
+	// Note that there may be extra files outside of the root source
+	// directory; we simply ignore those.
+	rootSrcPrefix := ""
+	for _, file := range config.Srcs {
+		const src = "/src/"
+		i := strings.Index(file, src)
+		if i == -1 {
+			// Superfluous file.
+			continue
+		}
+
+		// Index of first character after /src/.
+		i += len(src)
+		rootSrcPrefix = file[:i]
+		break
+	}
+
+	// Aggregate all files by directory.
+	packages := make(map[string]*PackageConfig)
+	for _, file := range config.Srcs {
+		if !strings.HasPrefix(file, rootSrcPrefix) {
+			// Superflouous file.
+			continue
+		}
+
+		d := path.Dir(file)
+		if len(rootSrcPrefix) >= len(d) {
+			continue // Not a file.
+		}
+		pkg := d[len(rootSrcPrefix):]
+		// Skip cmd packages and obvious test files: see above.
+		if strings.HasPrefix(pkg, "cmd/") || strings.HasSuffix(file, "_test.go") {
+			continue
+		}
+		c, ok := packages[pkg]
+		if !ok {
+			c = &PackageConfig{
+				ImportPath: pkg,
+				GOOS:       config.GOOS,
+				GOARCH:     config.GOARCH,
+				Tags:       config.Tags,
+			}
+			packages[pkg] = c
+		}
+		// Add the files appropriately. Note that they will be further
+		// filtered by architecture and build tags below, so this need
+		// not be done immediately.
+		if strings.HasSuffix(file, ".go") {
+			c.GoFiles = append(c.GoFiles, file)
+		} else {
+			c.NonGoFiles = append(c.NonGoFiles, file)
+		}
+	}
+
+	// Closure to check a single package.
+	stdlibFacts := make(map[string][]byte)
+	stdlibErrs := make(map[string]error)
+	var checkOne func(pkg string) error // Recursive.
+	checkOne = func(pkg string) error {
+		// Is this already done?
+		if _, ok := stdlibFacts[pkg]; ok {
+			return nil
+		}
+		// Did this fail previously?
+		if _, ok := stdlibErrs[pkg]; ok {
+			return nil
+		}
+
+		// Lookup the configuration.
+		config, ok := packages[pkg]
+		if !ok {
+			return nil // Not known.
+		}
+
+		// Find the binary package, and provide to objdump.
+		rc, err := findStdPkg(config.GOOS, config.GOARCH, pkg)
+		if err != nil {
+			// If there's no binary for this package, it is likely
+			// not built with the distribution. That's fine, we can
+			// just skip analysis.
+			stdlibErrs[pkg] = err
+			return nil
+		}
+
+		// Provide the input.
+		oldReader := checkescape.Reader
+		checkescape.Reader = rc // For analysis.
+		defer func() {
+			rc.Close()
+			checkescape.Reader = oldReader // Restore.
+		}()
+
+		// Run the analysis.
+		findings, factData, err := CheckPackage(config, analyzers, checkOne)
+		if err != nil {
+			// If we can't analyze a package from the standard library,
+			// then we skip it. It will simply not have any findings.
+			stdlibErrs[pkg] = err
+			return nil
+		}
+		stdlibFacts[pkg] = factData
+		allFindings = append(allFindings, findings...)
+		return nil
+	}
+
+	// Check all packages.
+	//
+	// Note that this may call checkOne recursively, so it's not guaranteed
+	// to evaluate in the order provided here. We do ensure however, that
+	// all packages are evaluated.
+	for pkg := range packages {
+		if err := checkOne(pkg); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	// Sanity check.
+	if len(stdlibFacts) == 0 {
+		return nil, nil, fmt.Errorf("no stdlib facts found: misconfiguration?")
+	}
+
+	// Write out all findings.
+	factData, err := json.Marshal(stdlibFacts)
+	if err != nil {
+		return nil, nil, fmt.Errorf("error saving stdlib facts: %w", err)
+	}
+
+	// Write out all errors.
+	for pkg, err := range stdlibErrs {
+		log.Printf("WARNING: error while processing %v: %v", pkg, err)
+	}
+
+	// Return all findings.
+	return allFindings, factData, nil
+}
+
+// CheckPackage runs all given analyzers.
 //
 // The implementation was adapted from [1], which was in turn adpated from [2].
 // This returns a list of matching analysis issues, or an error if the analysis
@@ -143,11 +350,12 @@ var ErrSkip = errors.New("skipped")
 //
 // [1] bazelbuid/rules_go/tools/builders/nogo_main.go
 // [2] golang.org/x/tools/go/checker/internal/checker
-func checkPackage(config pkgConfig) ([]string, error) {
+func CheckPackage(config *PackageConfig, analyzers []*analysis.Analyzer, importCallback func(string) error) (findings []Finding, factData []byte, err error) {
 	imp := &importer{
-		pkgConfig: config,
-		fset:      token.NewFileSet(),
-		cache:     make(map[string]*types.Package),
+		PackageConfig: config,
+		fset:          token.NewFileSet(),
+		cache:         make(map[string]*types.Package),
+		callback:      importCallback,
 	}
 
 	// Load all source files.
@@ -155,14 +363,14 @@ func checkPackage(config pkgConfig) ([]string, error) {
 	for _, file := range config.GoFiles {
 		include, err := config.shouldInclude(file)
 		if err != nil {
-			return nil, fmt.Errorf("error evaluating file %q: %v", file, err)
+			return nil, nil, fmt.Errorf("error evaluating file %q: %v", file, err)
 		}
 		if !include {
 			continue
 		}
 		s, err := parser.ParseFile(imp.fset, file, nil, parser.ParseComments)
 		if err != nil {
-			return nil, fmt.Errorf("error parsing file %q: %v", file, err)
+			return nil, nil, fmt.Errorf("error parsing file %q: %v", file, err)
 		}
 		syntax = append(syntax, s)
 	}
@@ -180,22 +388,22 @@ func checkPackage(config pkgConfig) ([]string, error) {
 	}
 	types, err := typeConfig.Check(config.ImportPath, imp.fset, syntax, typesInfo)
 	if err != nil && imp.lastErr != ErrSkip {
-		return nil, fmt.Errorf("error checking types: %w", err)
+		return nil, nil, fmt.Errorf("error checking types: %w", err)
 	}
 
 	// Load all package facts.
-	facts, err := facts.Decode(types, config.loadFacts)
+	loader, err := config.factLoader()
 	if err != nil {
-		return nil, fmt.Errorf("error decoding facts: %w", err)
+		return nil, nil, fmt.Errorf("error loading facts: %w", err)
+	}
+	facts, err := facts.Decode(types, loader)
+	if err != nil {
+		return nil, nil, fmt.Errorf("error decoding facts: %w", err)
 	}
-
-	// Set the binary global for use.
-	data.Objdump = config.Objdump
 
 	// Register fact types and establish dependencies between analyzers.
 	// The visit closure will execute recursively, and populate results
 	// will all required analysis results.
-	diagnostics := make(map[*analysis.Analyzer][]analysis.Diagnostic)
 	results := make(map[*analysis.Analyzer]interface{})
 	var visit func(*analysis.Analyzer) error // For recursion.
 	visit = func(a *analysis.Analyzer) error {
@@ -210,27 +418,25 @@ func checkPackage(config pkgConfig) ([]string, error) {
 			}
 		}
 
-		// Prepare the matcher.
-		m := analyzerConfig[a]
-		report := func(d analysis.Diagnostic) {
-			if m.ShouldReport(d, imp.fset) {
-				diagnostics[a] = append(diagnostics[a], d)
-			}
-		}
-
 		// Run the analysis.
 		factFilter := make(map[reflect.Type]bool)
 		for _, f := range a.FactTypes {
 			factFilter[reflect.TypeOf(f)] = true
 		}
 		p := &analysis.Pass{
-			Analyzer:          a,
-			Fset:              imp.fset,
-			Files:             syntax,
-			Pkg:               types,
-			TypesInfo:         typesInfo,
-			ResultOf:          results, // All results.
-			Report:            report,
+			Analyzer:  a,
+			Fset:      imp.fset,
+			Files:     syntax,
+			Pkg:       types,
+			TypesInfo: typesInfo,
+			ResultOf:  results, // All results.
+			Report: func(d analysis.Diagnostic) {
+				findings = append(findings, Finding{
+					Category: AnalyzerName(a.Name),
+					Position: imp.fset.Position(d.Pos),
+					Message:  d.Message,
+				})
+			},
 			ImportPackageFact: facts.ImportPackageFact,
 			ExportPackageFact: facts.ExportPackageFact,
 			ImportObjectFact:  facts.ImportObjectFact,
@@ -252,75 +458,16 @@ func checkPackage(config pkgConfig) ([]string, error) {
 		return nil // Success.
 	}
 
-	// Visit all analysis recursively.
-	for a, _ := range analyzerConfig {
+	// Visit all analyzers recursively.
+	for _, a := range analyzers {
 		if imp.lastErr == ErrSkip {
 			continue // No local analysis.
 		}
 		if err := visit(a); err != nil {
-			return nil, err // Already has context.
-		}
-	}
-
-	// Write the output file.
-	if config.FactOutput != "" {
-		factData := facts.Encode()
-		if err := ioutil.WriteFile(config.FactOutput, factData, 0644); err != nil {
-			return nil, fmt.Errorf("error: unable to open facts output %q: %v", config.FactOutput, err)
-		}
-	}
-
-	// Convert all diagnostics to strings.
-	findings := make([]string, 0, len(diagnostics))
-	for a, ds := range diagnostics {
-		for _, d := range ds {
-			// Include the anlyzer name for debugability and configuration.
-			findings = append(findings, fmt.Sprintf("%s: %s: %s", a.Name, imp.fset.Position(d.Pos), d.Message))
+			return nil, nil, err // Already has context.
 		}
 	}
 
 	// Return all findings.
-	return findings, nil
-}
-
-var (
-	configFile = flag.String("config", "", "configuration file (in JSON format)")
-)
-
-// Main is the entrypoint; it should be called directly from main.
-//
-// N.B. This package registers it's own flags.
-func Main() {
-	// Parse all flags.
-	flag.Parse()
-
-	// Load the configuration.
-	f, err := os.Open(*configFile)
-	if err != nil {
-		log.Fatalf("unable to open configuration %q: %v", *configFile, err)
-	}
-	defer f.Close()
-	config := new(pkgConfig)
-	dec := json.NewDecoder(f)
-	dec.DisallowUnknownFields()
-	if err := dec.Decode(config); err != nil {
-		log.Fatalf("unable to decode configuration: %v", err)
-	}
-
-	// Process the package.
-	findings, err := checkPackage(*config)
-	if err != nil {
-		log.Fatalf("error checking package: %v", err)
-	}
-
-	// No findings?
-	if len(findings) == 0 {
-		os.Exit(0)
-	}
-
-	// Print findings and exit with non-zero code.
-	for _, finding := range findings {
-		fmt.Fprintf(os.Stdout, "%s\n", finding)
-	}
-	os.Exit(1)
+	return findings, facts.Encode(), nil
 }
diff --git a/tools/nogo/register.go b/tools/nogo/register.go
deleted file mode 100644
index 62b499661..000000000
--- a/tools/nogo/register.go
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package nogo
-
-import (
-	"encoding/gob"
-	"log"
-
-	"golang.org/x/tools/go/analysis"
-)
-
-// analyzers returns all configured analyzers.
-func analyzers() (all []*analysis.Analyzer) {
-	for a, _ := range analyzerConfig {
-		all = append(all, a)
-	}
-	return all
-}
-
-func init() {
-	// Validate basic configuration.
-	if err := analysis.Validate(analyzers()); err != nil {
-		log.Fatalf("unable to validate analyzer: %v", err)
-	}
-
-	// Register all fact types.
-	//
-	// N.B. This needs to be done recursively, because there may be
-	// analyzers in the Requires list that do not appear explicitly above.
-	registered := make(map[*analysis.Analyzer]struct{})
-	var register func(*analysis.Analyzer)
-	register = func(a *analysis.Analyzer) {
-		if _, ok := registered[a]; ok {
-			return
-		}
-
-		// Regsiter dependencies.
-		for _, da := range a.Requires {
-			register(da)
-		}
-
-		// Register local facts.
-		for _, f := range a.FactTypes {
-			gob.Register(f)
-		}
-
-		registered[a] = struct{}{} // Done.
-	}
-	for _, a := range analyzers() {
-		register(a)
-	}
-}
diff --git a/tools/parsers/BUILD b/tools/parsers/BUILD
new file mode 100644
index 000000000..dab954e25
--- /dev/null
+++ b/tools/parsers/BUILD
@@ -0,0 +1,41 @@
+load("//tools:defs.bzl", "go_binary", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_test(
+    name = "parsers_test",
+    size = "small",
+    srcs = ["go_parser_test.go"],
+    library = ":parsers",
+    nogo = False,
+    deps = [
+        "//tools/bigquery",
+        "@com_github_google_go_cmp//cmp:go_default_library",
+    ],
+)
+
+go_library(
+    name = "parsers",
+    testonly = 1,
+    srcs = [
+        "go_parser.go",
+    ],
+    nogo = False,
+    visibility = ["//:sandbox"],
+    deps = [
+        "//test/benchmarks/tools",
+        "//tools/bigquery",
+    ],
+)
+
+go_binary(
+    name = "parser",
+    testonly = 1,
+    srcs = ["parser_main.go"],
+    nogo = False,
+    deps = [
+        ":parsers",
+        "//runsc/flag",
+        "//tools/bigquery",
+    ],
+)
diff --git a/tools/parsers/go_parser.go b/tools/parsers/go_parser.go
new file mode 100644
index 000000000..df4875e6a
--- /dev/null
+++ b/tools/parsers/go_parser.go
@@ -0,0 +1,150 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package parsers holds parsers to parse Benchmark test output.
+//
+// Parsers parse Benchmark test output and place it in BigQuery
+// structs for sending to BigQuery databases.
+package parsers
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+
+	"gvisor.dev/gvisor/test/benchmarks/tools"
+	"gvisor.dev/gvisor/tools/bigquery"
+)
+
+// ParseOutput expects golang benchmark output and returns a struct formatted
+// for BigQuery.
+func ParseOutput(output string, name string, official bool) (*bigquery.Suite, error) {
+	suite := bigquery.NewSuite(name)
+	lines := strings.Split(output, "\n")
+	for _, line := range lines {
+		bm, err := parseLine(line, official)
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse line '%s': %v", line, err)
+		}
+		if bm != nil {
+			suite.Benchmarks = append(suite.Benchmarks, bm)
+		}
+	}
+	return suite, nil
+}
+
+// parseLine handles parsing a benchmark line into a bigquery.Benchmark.
+//
+// Example: "BenchmarkRuby/server_threads.1-6 1	1397875880 ns/op 140 requests_per_second.QPS"
+//
+// This function will return the following benchmark:
+// *bigquery.Benchmark{
+//	Name: BenchmarkRuby
+//  []*bigquery.Condition{
+//		{Name: GOMAXPROCS, 6}
+//		{Name: server_threads, 1}
+//  }
+//  []*bigquery.Metric{
+//		{Name: ns/op, Unit: ns/op, Sample: 1397875880}
+//		{Name: requests_per_second, Unit: QPS, Sample: 140 }
+//  }
+//}
+func parseLine(line string, official bool) (*bigquery.Benchmark, error) {
+	fields := strings.Fields(line)
+
+	// Check if this line is a Benchmark line. Otherwise ignore the line.
+	if len(fields) < 2 || !strings.HasPrefix(fields[0], "Benchmark") {
+		return nil, nil
+	}
+
+	iters, err := strconv.Atoi(fields[1])
+	if err != nil {
+		return nil, fmt.Errorf("expecting number of runs, got %s: %v", fields[1], err)
+	}
+
+	name, params, err := parseNameParams(fields[0])
+	if err != nil {
+		return nil, fmt.Errorf("parse name/params: %v", err)
+	}
+
+	bm := bigquery.NewBenchmark(name, iters, official)
+	for _, p := range params {
+		bm.AddCondition(p.Name, p.Value)
+	}
+
+	for i := 1; i < len(fields)/2; i++ {
+		value := fields[2*i]
+		metric := fields[2*i+1]
+		if err := makeMetric(bm, value, metric); err != nil {
+			return nil, fmt.Errorf("makeMetric on metric %q value: %s: %v", metric, value, err)
+		}
+	}
+	return bm, nil
+}
+
+// parseNameParams parses the Name, GOMAXPROCS, and Params from the test.
+// Field here should be of the format TESTNAME/PARAMS-GOMAXPROCS.
+// Parameters will be separated by a "/" with individual params being
+// "name.value".
+func parseNameParams(field string) (string, []*tools.Parameter, error) {
+	var params []*tools.Parameter
+	// Remove GOMAXPROCS from end.
+	maxIndex := strings.LastIndex(field, "-")
+	if maxIndex < 0 {
+		return "", nil, fmt.Errorf("GOMAXPROCS not found: %s", field)
+	}
+	maxProcs := field[maxIndex+1:]
+	params = append(params, &tools.Parameter{
+		Name:  "GOMAXPROCS",
+		Value: maxProcs,
+	})
+
+	remainder := field[0:maxIndex]
+	index := strings.Index(remainder, "/")
+	if index == -1 {
+		return remainder, params, nil
+	}
+
+	name := remainder[0:index]
+	p := remainder[index+1:]
+
+	ps, err := tools.NameToParameters(p)
+	if err != nil {
+		return "", nil, fmt.Errorf("NameToParameters %s: %v", field, err)
+	}
+	params = append(params, ps...)
+	return name, params, nil
+}
+
+// makeMetric parses metrics and adds them to the passed Benchmark.
+func makeMetric(bm *bigquery.Benchmark, value, metric string) error {
+	switch metric {
+	// Ignore most output from golang benchmarks.
+	case "MB/s", "B/op", "allocs/op":
+		return nil
+	case "ns/op":
+		val, err := strconv.ParseFloat(value, 64)
+		if err != nil {
+			return fmt.Errorf("ParseFloat %s: %v", value, err)
+		}
+		bm.AddMetric(metric /*metric name*/, metric /*unit*/, val /*sample*/)
+	default:
+		m, err := tools.ParseCustomMetric(value, metric)
+		if err != nil {
+			return fmt.Errorf("ParseCustomMetric %s: %v ", metric, err)
+		}
+		bm.AddMetric(m.Name, m.Unit, m.Sample)
+	}
+	return nil
+}
diff --git a/tools/parsers/go_parser_test.go b/tools/parsers/go_parser_test.go
new file mode 100644
index 000000000..0aa1152a2
--- /dev/null
+++ b/tools/parsers/go_parser_test.go
@@ -0,0 +1,169 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package parsers
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/tools/bigquery"
+)
+
+func TestParseLine(t *testing.T) {
+	testCases := []struct {
+		name string
+		data string
+		want *bigquery.Benchmark
+	}{
+		{
+			name: "Iperf",
+			data: "BenchmarkIperf/Upload-6 1	11094914892 ns/op	4751711232 bandwidth.bytes_per_second",
+			want: &bigquery.Benchmark{
+				Name: "BenchmarkIperf",
+				Condition: []*bigquery.Condition{
+					{
+						Name:  "GOMAXPROCS",
+						Value: "6",
+					},
+					{
+						Name:  "Upload",
+						Value: "Upload",
+					},
+				},
+				Metric: []*bigquery.Metric{
+					{
+						Name:   "ns/op",
+						Unit:   "ns/op",
+						Sample: 11094914892.0,
+					},
+					{
+						Name:   "bandwidth",
+						Unit:   "bytes_per_second",
+						Sample: 4751711232.0,
+					},
+				},
+			},
+		},
+		{
+			name: "Ruby",
+			data: "BenchmarkRuby/server_threads.1-6 1	1397875880 ns/op	0.00710 average_latency.s 140 requests_per_second.QPS",
+			want: &bigquery.Benchmark{
+				Name: "BenchmarkRuby",
+				Condition: []*bigquery.Condition{
+					{
+						Name:  "GOMAXPROCS",
+						Value: "6",
+					},
+					{
+						Name:  "server_threads",
+						Value: "1",
+					},
+				},
+				Metric: []*bigquery.Metric{
+					{
+						Name:   "ns/op",
+						Unit:   "ns/op",
+						Sample: 1397875880.0,
+					},
+					{
+						Name:   "average_latency",
+						Unit:   "s",
+						Sample: 0.00710,
+					},
+					{
+						Name:   "requests_per_second",
+						Unit:   "QPS",
+						Sample: 140.0,
+					},
+				},
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := parseLine(tc.data, false)
+			if err != nil {
+				t.Fatalf("parseLine failed with: %v", err)
+			}
+
+			if !cmp.Equal(tc.want, got, nil) {
+				for _, c := range got.Condition {
+					t.Logf("Cond: %+v", c)
+				}
+				for _, m := range got.Metric {
+					t.Logf("Metric: %+v", m)
+				}
+				t.Fatalf("Compare failed want: %+v got: %+v", tc.want, got)
+			}
+		})
+
+	}
+}
+
+func TestParseOutput(t *testing.T) {
+	testCases := []struct {
+		name          string
+		data          string
+		numBenchmarks int
+		numMetrics    int
+		numConditions int
+	}{
+		{
+			name: "Startup",
+			data: `
+				BenchmarkStartupEmpty
+				BenchmarkStartupEmpty-6                2         766377884 ns/op	1 allocs/op
+				BenchmarkStartupNode
+				BenchmarkStartupNode-6                 1        1752158409 ns/op	1 allocs/op
+			`,
+			numBenchmarks: 2,
+			numMetrics:    1,
+			numConditions: 1,
+		},
+		{
+			name: "Ruby",
+			data: `BenchmarkRuby
+BenchmarkRuby/server_threads.1
+BenchmarkRuby/server_threads.1-6 1	1397875880 ns/op 0.00710 average_latency.s 140 requests_per_second.QPS
+BenchmarkRuby/server_threads.5
+BenchmarkRuby/server_threads.5-6 1	1416003331 ns/op	0.00950 average_latency.s 465 requests_per_second.QPS`,
+			numBenchmarks: 2,
+			numMetrics:    3,
+			numConditions: 2,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			suite, err := ParseOutput(tc.data, "", false)
+			if err != nil {
+				t.Fatalf("parseOutput failed: %v", err)
+			} else if len(suite.Benchmarks) != tc.numBenchmarks {
+				t.Fatalf("NumBenchmarks failed want: %d got: %d %+v", tc.numBenchmarks, len(suite.Benchmarks), suite.Benchmarks)
+			}
+
+			for _, bm := range suite.Benchmarks {
+				if len(bm.Metric) != tc.numMetrics {
+					t.Fatalf("NumMetrics failed want: %d got: %d %+v", tc.numMetrics, len(bm.Metric), bm.Metric)
+				}
+
+				if len(bm.Condition) != tc.numConditions {
+					t.Fatalf("NumConditions failed want: %d got: %d %+v", tc.numConditions, len(bm.Condition), bm.Condition)
+				}
+			}
+		})
+	}
+}
diff --git a/tools/parsers/parser_main.go b/tools/parsers/parser_main.go
new file mode 100644
index 000000000..6c6182464
--- /dev/null
+++ b/tools/parsers/parser_main.go
@@ -0,0 +1,129 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary parser parses Benchmark data from golang benchmarks,
+// puts it into a Schema for BigQuery, and sends it to BigQuery.
+// parser will also initialize a table with the Benchmarks BigQuery schema.
+package main
+
+import (
+	"context"
+	"fmt"
+	"io/ioutil"
+	"os"
+
+	"gvisor.dev/gvisor/runsc/flag"
+	bq "gvisor.dev/gvisor/tools/bigquery"
+	"gvisor.dev/gvisor/tools/parsers"
+)
+
+const (
+	initString       = "init"
+	initDescription  = "initializes a new table with benchmarks schema"
+	parseString      = "parse"
+	parseDescription = "parses given benchmarks file and sends it to BigQuery table."
+)
+
+var (
+	// The init command will create a new dataset/table in the given project and initialize
+	// the table with the schema in //tools/bigquery/bigquery.go. If the table/dataset exists
+	// or has been initialized, init has no effect and successfully returns.
+	initCmd     = flag.NewFlagSet(initString, flag.ContinueOnError)
+	initProject = initCmd.String("project", "", "GCP project to send benchmarks.")
+	initDataset = initCmd.String("dataset", "", "dataset to send benchmarks data.")
+	initTable   = initCmd.String("table", "", "table to send benchmarks data.")
+
+	// The parse command parses benchmark data in `file` and sends it to the
+	// requested table.
+	parseCmd     = flag.NewFlagSet(parseString, flag.ContinueOnError)
+	file         = parseCmd.String("file", "", "file to parse for benchmarks")
+	name         = parseCmd.String("suite_name", "", "name of the benchmark suite")
+	clNumber     = parseCmd.String("cl", "", "changelist number of this run")
+	gitCommit    = parseCmd.String("git_commit", "", "git commit sha for this run")
+	parseProject = parseCmd.String("project", "", "GCP project to send benchmarks.")
+	parseDataset = parseCmd.String("dataset", "", "dataset to send benchmarks data.")
+	parseTable   = parseCmd.String("table", "", "table to send benchmarks data.")
+	official     = parseCmd.Bool("official", false, "mark input data as official.")
+)
+
+// initBenchmarks initializes a dataset/table in a BigQuery project.
+func initBenchmarks(ctx context.Context) error {
+	return bq.InitBigQuery(ctx, *initProject, *initDataset, *initTable, nil)
+}
+
+// parseBenchmarks parses the given file into the BigQuery schema,
+// adds some custom data for the commit, and sends the data to BigQuery.
+func parseBenchmarks(ctx context.Context) error {
+	data, err := ioutil.ReadFile(*file)
+	if err != nil {
+		return fmt.Errorf("failed to read file: %v", err)
+	}
+	suite, err := parsers.ParseOutput(string(data), *name, *official)
+	if err != nil {
+		return fmt.Errorf("failed parse data: %v", err)
+	}
+	extraConditions := []*bq.Condition{
+		{
+			Name:  "change_list",
+			Value: *clNumber,
+		},
+		{
+			Name:  "commit",
+			Value: *gitCommit,
+		},
+	}
+
+	suite.Conditions = append(suite.Conditions, extraConditions...)
+	return bq.SendBenchmarks(ctx, suite, *parseProject, *parseDataset, *parseTable, nil)
+}
+
+func main() {
+	ctx := context.Background()
+	switch {
+	// the "init" command
+	case len(os.Args) >= 2 && os.Args[1] == initString:
+		if err := initCmd.Parse(os.Args[2:]); err != nil {
+			fmt.Fprintf(os.Stderr, "failed parse flags: %v", err)
+			os.Exit(1)
+		}
+		if err := initBenchmarks(ctx); err != nil {
+			failure := "failed to initialize project: %s dataset: %s table: %s: %v"
+			fmt.Fprintf(os.Stderr, failure, *parseProject, *parseDataset, *parseTable, err)
+			os.Exit(1)
+		}
+	// the "parse" command.
+	case len(os.Args) >= 2 && os.Args[1] == parseString:
+		if err := parseCmd.Parse(os.Args[2:]); err != nil {
+			fmt.Fprintf(os.Stderr, "failed parse flags: %v", err)
+			os.Exit(1)
+		}
+		if err := parseBenchmarks(ctx); err != nil {
+			fmt.Fprintf(os.Stderr, "failed parse benchmarks: %v", err)
+			os.Exit(1)
+		}
+	default:
+		printUsage()
+	}
+}
+
+// printUsage prints the top level usage string.
+func printUsage() {
+	usage := `Usage: parser <command> <flags> ...
+
+Available commands:
+  %s     %s
+  %s     %s
+`
+	fmt.Fprintf(os.Stderr, usage, initCmd.Name(), initDescription, parseCmd.Name(), parseDescription)
+}
diff --git a/tools/rules_go.patch b/tools/rules_go.patch
new file mode 100644
index 000000000..5e1e87084
--- /dev/null
+++ b/tools/rules_go.patch
@@ -0,0 +1,14 @@
+diff --git a/go/private/rules/test.bzl b/go/private/rules/test.bzl
+index 17516ad7..76b6c68c 100644
+--- a/go/private/rules/test.bzl
++++ b/go/private/rules/test.bzl
+@@ -121,9 +121,6 @@ def _go_test_impl(ctx):
+     )
+ 
+     test_gc_linkopts = gc_linkopts(ctx)
+-    if not go.mode.debug:
+-        # Disable symbol table and DWARF generation for test binaries.
+-        test_gc_linkopts.extend(["-s", "-w"])
+ 
+     # Now compile the test binary itself
+     test_library = GoLibrary(
diff --git a/tools/tag_release.sh b/tools/tag_release.sh
index b0bab74b4..50378065e 100755
--- a/tools/tag_release.sh
+++ b/tools/tag_release.sh
@@ -43,7 +43,7 @@ fi
 
 closest_commit() {
   while read line; do
-    if [[ "$line" =~ "commit " ]]; then
+    if [[ "$line" =~ ^"commit " ]]; then
         current_commit="${line#commit }"
         continue
     elif [[ "$line" =~ "PiperOrigin-RevId: " ]]; then
@@ -57,7 +57,9 @@ closest_commit() {
 # Is the passed identifier a sha commit?
 if ! git show "${target_commit}" &> /dev/null; then
   # Extract the commit given a piper ID.
-  declare -r commit="$(git log | closest_commit "${target_commit}")"
+  commit="$(set +o pipefail; \
+    git log --first-parent | closest_commit "${target_commit}")"
+  declare -r commit
 else
   declare -r commit="${target_commit}"
 fi
diff --git a/website/BUILD b/website/BUILD
index 7b61d13c8..676c2b701 100644
--- a/website/BUILD
+++ b/website/BUILD
@@ -1,17 +1,20 @@
 load("//tools:defs.bzl", "bzl_library", "pkg_tar")
 load("//website:defs.bzl", "doc", "docs")
+load("//images:defs.bzl", "docker_image")
 
 package(licenses = ["notice"])
 
-# website is the full container image. Note that this actually just collects
-# other dependendcies and runs Docker locally to import and tag the image.
-sh_binary(
+docker_image(
     name = "website",
-    srcs = ["import.sh"],
-    data = [":files"],
+    data = ":files",
+    statements = [
+        "EXPOSE 8080/tcp",
+        'ENTRYPOINT ["/server"]',
+    ],
     tags = [
         "local",
         "manual",
+        "nosandbox",
     ],
 )
 
@@ -157,6 +160,7 @@ docs(
         "//g3doc/user_guide/quick_start:oci",
         "//g3doc/user_guide/tutorials:cni",
         "//g3doc/user_guide/tutorials:docker",
+        "//g3doc/user_guide/tutorials:docker_compose",
         "//g3doc/user_guide/tutorials:kubernetes",
     ],
 )
diff --git a/website/_config.yml b/website/_config.yml
index b08602970..51cb8e13c 100644
--- a/website/_config.yml
+++ b/website/_config.yml
@@ -34,3 +34,12 @@ authors:
   igudger:
     name: Ian Gudger
     email: igudger@google.com
+  fvoznika:
+    name: Fabricio Voznika
+    email: fvoznika@google.com
+  ianlewis:
+    name: Ian Lewis
+    email: ianlewis@google.com
+  mpratt:
+    name: Michael Pratt
+    email: mpratt@google.com
diff --git a/website/_sass/front.scss b/website/_sass/front.scss
index 0e4208f3c..f1b060560 100644
--- a/website/_sass/front.scss
+++ b/website/_sass/front.scss
@@ -1,5 +1,5 @@
 .jumbotron {
-  background-image: url(/assets/images/background.jpg);
+  background-image: url(/assets/images/background_1080p.jpg);
   background-position: center;
   background-repeat: no-repeat;
   background-size: cover;
diff --git a/website/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png b/website/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png
new file mode 100644
index 000000000..c750f0851
--- /dev/null
+++ b/website/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png
diff --git a/website/assets/images/background_1080p.jpg b/website/assets/images/background_1080p.jpg
new file mode 100644
index 000000000..d312595a6
--- /dev/null
+++ b/website/assets/images/background_1080p.jpg
diff --git a/website/blog/2019-11-18-security-basics.md b/website/blog/2019-11-18-security-basics.md
index 76bbabc13..b6cf57a77 100644
--- a/website/blog/2019-11-18-security-basics.md
+++ b/website/blog/2019-11-18-security-basics.md
@@ -188,7 +188,7 @@ for direct access to some files. And most files will be remotely accessed
 through the Gofers, in which case no FDs are donated to the Sentry.
 
 The Sentry itself is only allowed access to specific
-[whitelisted syscalls](https://github.com/google/gvisor/blob/master/runsc/boot/config.go).
+[whitelisted syscalls](https://github.com/google/gvisor/blob/master/runsc/config/config.go).
 Without networking, the Sentry needs 53 host syscalls in order to function, and
 with networking, it uses an additional 15[^8]. By limiting the whitelist to only
 these needed syscalls, we radically reduce the amount of host OS attack surface.
@@ -279,8 +279,10 @@ weaknesses of each gVisor component.
 We will also use it to introduce Google's Vulnerability Reward Program[^14], and
 other ways the community can contribute to help make gVisor safe, fast and
 stable.
+<br>
+<br>
 
-## Notes
+--------------------------------------------------------------------------------
 
 [^1]: [https://en.wikipedia.org/wiki/Secure_by_design](https://en.wikipedia.org/wiki/Secure_by_design)
 [^2]: [https://gvisor.dev/docs/architecture_guide](https://gvisor.dev/docs/architecture_guide/)
diff --git a/website/blog/2020-09-18-containing-a-real-vulnerability.md b/website/blog/2020-09-18-containing-a-real-vulnerability.md
new file mode 100644
index 000000000..8a6f7bbf1
--- /dev/null
+++ b/website/blog/2020-09-18-containing-a-real-vulnerability.md
@@ -0,0 +1,226 @@
+# Containing a Real Vulnerability
+
+In the previous two posts we talked about gVisor's
+[security design principles](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/)
+as well as how those are applied in the
+[context of networking](https://gvisor.dev/blog/2020/04/02/gvisor-networking-security/).
+Recently, a new container escape vulnerability
+([CVE-2020-14386](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-14386))
+was announced that ties these topics well together. gVisor is
+[not vulnerable](https://seclists.org/oss-sec/2020/q3/168) to this specific
+issue, but it provides an interesting case study to continue our exploration of
+gVisor's security. While gVisor is not immune to vulnerabilities,
+[we take several steps](https://gvisor.dev/security/) to minimize the impact and
+remediate if a vulnerability is found.
+
+## Escaping the Container
+
+First, let’s describe how the discovered vulnerability works. There are numerous
+ways one can send and receive bytes over the network with Linux. One of the most
+performant ways is to use a ring buffer, which is a memory region shared by the
+application and the kernel. These rings are created by calling
+[setsockopt(2)](https://man7.org/linux/man-pages/man2/setsockopt.2.html) with
+[`PACKET_RX_RING`](https://man7.org/linux/man-pages/man7/packet.7.html) for
+receiving and
+[`PACKET_TX_RING`](https://man7.org/linux/man-pages/man7/packet.7.html) for
+sending packets.
+
+The vulnerability is in the code that reads packets when `PACKET_RX_RING` is
+enabled. There is another option
+([`PACKET_RESERVE`](https://man7.org/linux/man-pages/man7/packet.7.html)) that
+asks the kernel to leave some space in the ring buffer before each packet for
+anything the application needs, e.g. control structures. When a packet is
+received, the kernel calculates where to copy the packet to, taking the amount
+reserved before each packet into consideration. If the amount reserved is large,
+the kernel performed an incorrect calculation which could cause an overflow
+leading to an out-of-bounds write of up to 10 bytes, controlled by the attacker.
+The data in the write is easily controlled using the loopback to send a crafted
+packet and receiving it using a `PACKET_RX_RING` with a carefully selected
+`PACKET_RESERVE` size.
+
+```c
+static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
+               struct packet_type *pt, struct net_device *orig_dev)
+{
+// ...
+    if (sk->sk_type == SOCK_DGRAM) {
+        macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
+                  po->tp_reserve;
+    } else {
+        unsigned int maclen = skb_network_offset(skb);
+        // tp_reserve is unsigned int, netoff is unsigned short.
+        // Addition can overflow netoff
+        netoff = TPACKET_ALIGN(po->tp_hdrlen +
+                       (maclen < 16 ? 16 : maclen)) +
+                       po->tp_reserve;
+        if (po->has_vnet_hdr) {
+            netoff += sizeof(struct virtio_net_hdr);
+            do_vnet = true;
+        }
+        // Attacker controls netoff and can make macoff be smaller
+        // than sizeof(struct virtio_net_hdr)
+        macoff = netoff - maclen;
+    }
+// ...
+    // "macoff - sizeof(struct virtio_net_hdr)" can be negative,
+    // resulting in a pointer before h.raw
+    if (do_vnet &&
+        virtio_net_hdr_from_skb(skb, h.raw + macoff -
+                    sizeof(struct virtio_net_hdr),
+                    vio_le(), true, 0)) {
+// ...
+```
+
+The [`CAP_NET_RAW`](https://man7.org/linux/man-pages/man7/capabilities.7.html)
+capability is required to create the socket above. However, in order to support
+common debugging tools like `ping` and `tcpdump`, Docker containers, including
+those created for Kubernetes, are given `CAP_NET_RAW` by default and thus may be
+able to trigger this vulnerability to elevate privileges and escape the
+container.
+
+Next, we are going to explore why this vulnerability doesn’t work in gVisor, and
+how gVisor could prevent the escape even if a similar vulnerability existed
+inside gVisor’s kernel.
+
+## Default Protections
+
+gVisor does not implement `PACKET_RX_RING`, but **does** support raw sockets
+which are required for `PACKET_RX_RING`. Raw sockets are a controversial feature
+to support in a sandbox environment. While it allows great customizations for
+essential tools like `ping`, it may allow packets to be written to the network
+without any validation. In general, allowing an untrusted application to write
+crafted packets to the network is a questionable idea and a historical source of
+vulnerabilities. With that in mind, if `CAP_NET_RAW` is enabled by default, it
+would not be _secure by default_ to run untrusted applications.
+
+After multiple discussions when raw sockets were first implemented, we decided
+to disable raw sockets by default, **even if `CAP_NET_RAW` is given to the
+application**. Instead, enabling raw sockets in gVisor requires the admin to set
+`--net-raw` flag to runsc when configuring the runtime, in addition to requiring
+the `CAP_NET_RAW` capability in the application. It comes at the expense that
+some tools may not work out of the box, but as part of our
+[secure-by-default](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/#secure-by-default)
+principle, we felt that it was important for the “less secure” configuration to
+be explicit.
+
+Since this bug was due to an overflow in the specific Linux implementation of
+the packet ring, gVisor's raw socket implementation is not affected. However, if
+there were a vulnerability in gVisor, containers would not be allowed to exploit
+it by default.
+
+As an alternative way to implement this same constraint, Kubernetes allows
+[admission controllers](https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/)
+to be configured to customize requests. Cloud providers can use this to
+implement more stringent policies. For example, GKE implements an admission
+controller for gVisor that
+[removes `CAP_NET_RAW` from gVisor pods](https://cloud.google.com/kubernetes-engine/docs/concepts/sandbox-pods#capabilities)
+unless it has been explicitly set in the pod spec.
+
+## Isolated Kernel
+
+gVisor has its own application kernel, called the Sentry, that is distinct from
+the host kernel. Just like what you would expect from a kernel, gVisor has a
+memory management subsystem, virtual file system, and a full network stack. The
+host network is only used as a transport to carry packets in and out the
+sandbox[^1]. The loopback interface which is used in the exploit stays
+completely inside the sandbox, never reaching the host.
+
+Therefore, even if the Sentry was vulnerable to the attack, there would be two
+factors that would prevent a container escape from happening. First, the
+vulnerability would be limited to the Sentry, and the attacker would compromise
+only the application kernel, bound by a restricted set of
+[seccomp](https://en.wikipedia.org/wiki/Seccomp) filters, discussed more in
+depth below. Second, the Sentry is a distinct implementation of the API, written
+in Go, which provides bounds checking that would have likely prevented access
+past the bounds of the shared region (e.g. see
+[aio](https://cs.opensource.google/gvisor/gvisor/+/master:pkg/sentry/syscalls/linux/vfs2/aio.go;l=210;drc=a11061d78a58ed75b10606d1a770b035ed944b66?q=file:aio&ss=gvisor%2Fgvisor)
+or
+[kcov](https://cs.opensource.google/gvisor/gvisor/+/master:pkg/sentry/kernel/kcov.go;l=272?q=file:kcov&ss=gvisor%2Fgvisor),
+which have similar shared regions).
+
+Here, Kubernetes warrants slightly more explanation. gVisor makes pods the unit
+of isolation and a pod can run multiple containers. In other words, each pod is
+a gVisor instance, and each container is a set of processes running inside
+gVisor, isolated via Sentry-internal namespaces like regular containers inside a
+pod. If there were a vulnerability in gVisor, the privilege escalation would
+allow a container inside the pod to break out to other **containers inside the
+same pod**, but the container still **cannot break out of the pod**.
+
+## Defense in Depth
+
+gVisor follows a
+[common security principle used at Google](https://cloud.google.com/security/infrastructure/design/resources/google_infrastructure_whitepaper_fa.pdf)
+that the system should have two layers of protection, and those layers should
+require different compromises to be broken. We apply this principle by assuming
+that the Sentry (first layer of defense)
+[will be compromised and should not be trusted](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/#defense-in-depth).
+In order to protect the host kernel from a compromised Sentry, we wrap it around
+many security and isolations features to ensure only the minimal set of
+functionality from the host kernel is exposed.
+
+![Figure 1](/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png "Protection layers.")
+
+First, the sandbox runs inside a cgroup that can limit and throttle host
+resources being used. Second, the sandbox joins empty namespaces, including user
+and mount, to further isolate from the host. Next, it changes the process root
+to a read-only directory that contains only `/proc` and nothing else. Then, it
+executes with the unprivileged user/group
+[`nobody`](https://en.wikipedia.org/wiki/Nobody_\(username\)) with all
+capabilities stripped. Last and most importantly, a seccomp filter is added to
+tightly restrict what parts of the Linux syscall surface that gVisor is allowed
+to access. The allowed host surface is a far smaller set of syscalls than the
+Sentry implements for applications to use. Not only restricting the syscall
+being called, but also checking that arguments to these syscalls are within the
+expected set. Dangerous syscalls like <code>execve(2)</code>,
+<code>open(2)</code>, and <code>socket(2)</code> are prohibited, thus an
+attacker isn’t able to execute binaries or acquire new resources on the host.
+
+if there were a vulnerability in gVisor that allowed an attacker to execute code
+inside the Sentry, the attacker still has extremely limited privileges on the
+host. In fact, a compromised Sentry is much more restricted than a
+non-compromised regular container. For CVE-2020-14386 in particular, the attack
+would be blocked by more than one security layer: non-privileged user, no
+capability, and seccomp filters.
+
+Although the surface is drastically reduced, there is still a chance that there
+is a vulnerability in one of the allowed syscalls. That’s why it’s important to
+keep the surface small and carefully consider what syscalls are allowed. You can
+find the full set of allowed syscalls
+[here](https://cs.opensource.google/gvisor/gvisor/+/master:runsc/boot/filter/).
+
+Another possible attack vector is resources that are present in the Sentry, like
+open file descriptors. The Sentry has file descriptors that an attacker could
+potentially use, such as log files, platform files (e.g. `/dev/kvm`), an RPC
+endpoint that allows external communication with the Sentry, and a Netstack
+endpoint that connects the sandbox to the network. The Netstack endpoint in
+particular is a concern because it gives direct access to the network. It’s an
+`AF_PACKET` socket that allows arbitrary L2 packets to be written to the
+network. In the normal case, Netstack assembles packets that go out the network,
+giving the container control over only the payload. But if the Sentry is
+compromised, an attacker can craft packets to the network. In many ways this is
+similar to anyone sending random packets over the internet, but still this is a
+place where the host kernel surface exposed is larger than we would like it to
+be.
+
+## Conclusion
+
+Security comes with many tradeoffs that are often hard to make, such as the
+decision to disable raw sockets by default. However, these tradeoffs have served
+us well, and we've found them to have paid off over time. CVE-2020-14386 offers
+great insight into how multiple layers of protection can be effective against
+such an attack.
+
+We cannot guarantee that a container escape will never happen in gVisor, but we
+do our best to make it as hard as we possibly can.
+
+If you have not tried gVisor yet, it’s easier than you think. Just follow the
+steps [here](https://gvisor.dev/docs/user_guide/install/).
+<br>
+<br>
+
+--------------------------------------------------------------------------------
+
+[^1]: Those packets are eventually handled by the host, as it needs to route
+    them to local containers or send them out the NIC. The packet will be
+    handled by many switches, routers, proxies, servers, etc. along the way,
+    which may be subject to their own vulnerabilities.
diff --git a/website/blog/2020-10-22-platform-portability.md b/website/blog/2020-10-22-platform-portability.md
new file mode 100644
index 000000000..4d82940f9
--- /dev/null
+++ b/website/blog/2020-10-22-platform-portability.md
@@ -0,0 +1,120 @@
+# Platform Portability
+
+Hardware virtualization is often seen as a requirement to provide an additional
+isolation layer for untrusted applications. However, hardware virtualization
+requires expensive bare-metal machines or cloud instances to run safely with
+good performance, increasing cost and complexity for Cloud users. gVisor,
+however, takes a more flexible approach.
+
+One of the pillars of gVisor's architecture is portability, allowing it to run
+anywhere that runs Linux. Modern Cloud-Native applications run in containers in
+many different places, from bare metal to virtual machines, and can't always
+rely on nested virtualization. It is important for gVisor to be able to support
+the environments where you run containers.
+
+gVisor achieves portability through an abstraction called a _Platform_.
+Platforms can have many implementations, and each implementation can cover
+different environments, making use of available software or hardware features.
+
+## Background
+
+Before we can understand how gVisor achieves portability using platforms, we
+should take a step back and understand how applications interact with their
+host.
+
+Container sandboxes can provide an isolation layer between the host and
+application by virtualizing one of the layers below it, including the hardware
+or operating system. Many sandboxes virtualize the hardware layer by running
+applications in virtual machines. gVisor takes a different approach by
+virtualizing the OS layer.
+
+When an application is run in a normal situation the host operating system loads
+the application into user memory and schedules it for execution. The operating
+system scheduler eventually schedules the application to a CPU and begins
+executing it. It then handles the application's requests, such as for memory and
+the lifecycle of the application. gVisor virtualizes these interactions, such as
+system calls, and context switching that happen between an application and OS.
+
+[System calls](https://en.wikipedia.org/wiki/System_call) allow applications to
+ask the OS to perform some task for it. System calls look like a normal function
+call in most programming languages though works a bit differently under the
+hood. When an application system call is encountered some special processing
+takes place to do a
+[context switch](https://en.wikipedia.org/wiki/Context_switch) into kernel mode
+and begin executing code in the kernel before returning a result to the
+application. Context switching may happen in other situations as well. For
+example, to respond to an interrupt.
+
+## The Platform Interface
+
+gVisor provides a sandbox which implements the Linux OS interface, intercepting
+OS interactions such as system calls and implements them in the sandbox kernel.
+
+It does this to limit interactions with the host, and protect the host from an
+untrusted application running in the sandbox. The Platform is the bottom layer
+of gVisor which provides the environment necessary for gVisor to control and
+manage applications. In general, the Platform must:
+
+1.  Provide the ability to create and manage memory address spaces.
+2.  Provide execution contexts for running applications in those memory address
+    spaces.
+3.  Provide the ability to change execution context and return control to gVisor
+    at specific times (e.g. system call, page fault)
+
+This interface is conceptually simple, but very powerful. Since the Platform
+interface only requires these three capabilities, it gives gVisor enough control
+for it to act as the application's OS, while still allowing the use of very
+different isolation technologies under the hood. You can learn more about the
+Platform interface in the
+[Platform Guide](https://gvisor.dev/docs/architecture_guide/platforms/).
+
+## Implementations of the Platform Interface
+
+While gVisor can make use of technologies like hardware virtualization, it
+doesn't necessarily rely on any one technology to provide a similar level of
+isolation. The flexibility of the Platform interface allows for implementations
+that use technologies other than hardware virtualization. This allows gVisor to
+run in VMs without nested virtualization, for example. By providing an
+abstraction for the underlying platform, each implementation can make various
+tradeoffs regarding performance or hardware requirements.
+
+Currently gVisor provides two gVisor Platform implementations; the Ptrace
+Platform, and the KVM Platform, each using very different methods to implement
+the Platform interface.
+
+![gVisor Platforms](../../../../../docs/architecture_guide/platforms/platforms.png "Platforms")
+
+The Ptrace Platform uses
+[PTRACE\_SYSEMU](http://man7.org/linux/man-pages/man2/ptrace.2.html) to trap
+syscalls, and uses the host for memory mapping and context switching. This
+platform can run anywhere that ptrace is available, which includes most Linux
+systems, VMs or otherwise.
+
+The KVM Platform uses virtualization, but in an unconventional way. gVisor runs
+in a virtual machine but as both guest OS and VMM, and presents no virtualized
+hardware layer. This provides a simpler interface that can avoid hardware
+initialization for fast start up, while taking advantage of hardware
+virtualization support to improve memory isolation and performance of context
+switching.
+
+The flexibility of the Platform interface allows for a lot of room to improve
+the existing KVM and ptrace platforms, as well as the ability to utilize new
+methods for improving gVisor's performance or portability in future Platform
+implementations.
+
+## Portability
+
+Through the Platform interface, gVisor is able to support bare metal, virtual
+machines, and Cloud environments while still providing a highly secure sandbox
+for running untrusted applications. This is especially important for Cloud and
+Kubernetes users because it allows gVisor to run anywhere that Kubernetes can
+run and provide similar experiences in multi-region, hybrid, multi-platform
+environments.
+
+Give gVisor's open source platforms a try. Using a Platform is as easy as
+providing the `--platform` flag to `runsc`. See the documentation on
+[changing platforms](https://gvisor.dev/docs/user_guide/platforms/) for how to
+use different platforms with Docker. We would love to hear about your experience
+so come chat with us in our
+[Gitter channel](https://gitter.im/gvisor/community), or send us an
+[issue on Github](https://gvisor.dev/issue) if you run into any problems.
diff --git a/website/blog/BUILD b/website/blog/BUILD
index 01c1f5a6e..17beb721f 100644
--- a/website/blog/BUILD
+++ b/website/blog/BUILD
@@ -28,6 +28,27 @@ doc(
     permalink = "/blog/2020/04/02/gvisor-networking-security/",
 )
 
+doc(
+    name = "containing_a_real_vulnerability",
+    src = "2020-09-18-containing-a-real-vulnerability.md",
+    authors = [
+        "fvoznika",
+    ],
+    layout = "post",
+    permalink = "/blog/2020/09/18/containing-a-real-vulnerability/",
+)
+
+doc(
+    name = "platform_portability",
+    src = "2020-10-22-platform-portability.md",
+    authors = [
+        "ianlewis",
+        "mpratt",
+    ],
+    layout = "post",
+    permalink = "/blog/2020/10/22/platform-portability/",
+)
+
 docs(
     name = "posts",
     deps = [
diff --git a/website/cmd/server/main.go b/website/cmd/server/main.go
index c401b6abd..ac09550a9 100644
--- a/website/cmd/server/main.go
+++ b/website/cmd/server/main.go
@@ -29,6 +29,7 @@ var redirects = map[string]string{
 	// GitHub redirects.
 	"/change":    "https://github.com/google/gvisor",
 	"/issue":     "https://github.com/google/gvisor/issues",
+	"/issues":    "https://github.com/google/gvisor/issues",
 	"/issue/new": "https://github.com/google/gvisor/issues/new",
 	"/pr":        "https://github.com/google/gvisor/pulls",
 
@@ -44,14 +45,16 @@ var redirects = map[string]string{
 	"/c/linux/amd64": "/docs/user_guide/compatibility/linux/amd64/",
 
 	// Redirect for old URLs.
-	"/docs/user_guide/compatibility/amd64/": "/docs/user_guide/compatibility/linux/amd64/",
-	"/docs/user_guide/compatibility/amd64":  "/docs/user_guide/compatibility/linux/amd64/",
-	"/docs/user_guide/kubernetes/":          "/docs/user_guide/quick_start/kubernetes/",
-	"/docs/user_guide/kubernetes":           "/docs/user_guide/quick_start/kubernetes/",
-	"/docs/user_guide/oci/":                 "/docs/user_guide/quick_start/oci/",
-	"/docs/user_guide/oci":                  "/docs/user_guide/quick_start/oci/",
-	"/docs/user_guide/docker/":              "/docs/user_guide/quick_start/docker/",
-	"/docs/user_guide/docker":               "/docs/user_guide/quick_start/docker/",
+	"/docs/user_guide/compatibility/amd64/":  "/docs/user_guide/compatibility/linux/amd64/",
+	"/docs/user_guide/compatibility/amd64":   "/docs/user_guide/compatibility/linux/amd64/",
+	"/docs/user_guide/kubernetes/":           "/docs/user_guide/quick_start/kubernetes/",
+	"/docs/user_guide/kubernetes":            "/docs/user_guide/quick_start/kubernetes/",
+	"/docs/user_guide/oci/":                  "/docs/user_guide/quick_start/oci/",
+	"/docs/user_guide/oci":                   "/docs/user_guide/quick_start/oci/",
+	"/docs/user_guide/docker/":               "/docs/user_guide/quick_start/docker/",
+	"/docs/user_guide/docker":                "/docs/user_guide/quick_start/docker/",
+	"/blog/2020/09/22/platform-portability":  "/blog/2020/10/22/platform-portability/",
+	"/blog/2020/09/22/platform-portability/": "/blog/2020/10/22/platform-portability/",
 
 	// Deprecated, but links continue to work.
 	"/cl": "https://gvisor-review.googlesource.com",
@@ -60,6 +63,7 @@ var redirects = map[string]string{
 var prefixHelpers = map[string]string{
 	"change": "https://github.com/google/gvisor/commit/%s",
 	"issue":  "https://github.com/google/gvisor/issues/%s",
+	"issues": "https://github.com/google/gvisor/issues/%s",
 	"pr":     "https://github.com/google/gvisor/pull/%s",
 
 	// Redirects to compatibility docs.
diff --git a/website/css/main.scss b/website/css/main.scss
index 06106833f..4b3b7b500 100644
--- a/website/css/main.scss
+++ b/website/css/main.scss
@@ -1,5 +1,10 @@
-@import 'style.scss';
-@import 'front.scss';
-@import 'navbar.scss';
-@import 'sidebar.scss';
-@import 'footer.scss';
+// The main style sheet for gvisor.dev
+
+// NOTE: Do not include file extensions to import .sass and .css files seamlessly.
+@import "style";
+@import "front";
+@import "navbar";
+@import "sidebar";
+@import "footer";
+// syntax is generated by rougify.
+@import "syntax";
diff --git a/website/import.sh b/website/import.sh
deleted file mode 100755
index e1350e83d..000000000
--- a/website/import.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -xeuo pipefail
-
-if [[ -d $0.runfiles ]]; then
-  cd $0.runfiles
-fi
-
-exec docker import \
-  -c "EXPOSE 8080/tcp" \
-  -c "ENTRYPOINT [\"/server\"]" \
-  $(find . -name files.tgz) \
-  gvisor.dev/images/website
diff --git a/website/index.md b/website/index.md
index 84f877d49..c6cd477c2 100644
--- a/website/index.md
+++ b/website/index.md
@@ -5,7 +5,7 @@
       <div class="col-md-6">
         <p>gVisor is an <b>application kernel</b> for <b>containers</b> that provides efficient defense-in-depth anywhere.</p>
         <p style="margin-top: 20px;">
-          <a class="btn" href="/docs/user_guide/quick_start/docker/">Quick start&nbsp;<i class="fas fa-arrow-alt-circle-right ml-2"></i></a>
+          <a class="btn" href="/docs/user_guide/install/">Get started&nbsp;<i class="fas fa-arrow-alt-circle-right ml-2"></i></a>
           <a class="btn" href="/docs/">Learn More&nbsp;<i class="fas fa-arrow-alt-circle-right ml-2"></i></a>
         </p>
       </div>